package io.fathom.cloud.blobs.replicated;
import io.fathom.cloud.blobs.BlobData;
import io.fathom.cloud.blobs.BlobStore;
import io.fathom.cloud.protobuf.CloudCommons.FixReplica;
import io.fathom.cloud.protobuf.CloudCommons.PeerRequest;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fathomdb.TimeSpan;
import com.fathomdb.utils.Hex;
import com.google.common.base.Joiner;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.protobuf.ByteString;
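/**
 * Repairs blob replication across a {@link StorageCluster}: lists the keys
 * held by every node under a given prefix, re-replicates blobs that have
 * fewer than {@code dataReplicaCount} copies, and detects blobs that are
 * over-replicated or stored on the wrong ring nodes.
 *
 * <p>A minimal usage sketch (the construction of the cluster and blob store
 * key is assumed, not shown):
 *
 * <pre>{@code
 * ReplicaRepair repair = new ReplicaRepair(cluster, blobStoreKey);
 * repair.repair(prefix); // repair all blobs whose keys start with prefix
 * }</pre>
 */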
public class ReplicaRepair {
private static final Logger log = LoggerFactory.getLogger(ReplicaRepair.class);
final StorageCluster cluster;
final String blobStoreKey;
public ReplicaRepair(StorageCluster cluster, String blobStoreKey) {
this.cluster = cluster;
this.blobStoreKey = blobStoreKey;
}
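// FixReplica work batched into one PeerRequest per destination node;
// accumulated during repair() and sent by flushPeerRequests().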
final Map<StorageNode, PeerRequest.Builder> peerRequests = Maps.newHashMap();
PeerRequest.Builder getPeerRequest(StorageNode node) {
PeerRequest.Builder peerRequest = peerRequests.get(node);
if (peerRequest == null) {
peerRequest = PeerRequest.newBuilder();
peerRequests.put(node, peerRequest);
}
return peerRequest;
}
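/**
 * Runs one repair pass over all blobs whose keys start with {@code prefix}.
 * The pass has three phases: (1) list keys on every node, retrying
 * unreachable nodes up to three times with a five-second pause between
 * rounds; (2) enqueue copies for under-replicated blobs, repairing the
 * most under-replicated first; (3) detect over-replicated blobs, which are
 * currently only logged, not trimmed. Batched peer requests are flushed at
 * the end of the pass.
 */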
public void repair(String prefix) {
HashMultimap<ByteString, StorageNode> keyMap = HashMultimap.create();
{
List<StorageNode> queue = Lists.newArrayList(cluster.ring.all());
// TODO: Run in parallel?
for (int attempt = 1; attempt <= 3; attempt++) {
List<StorageNode> retry = Lists.newArrayList();
for (StorageNode node : queue) {
try {
Iterable<ByteString> keys = node.getBlobStore(blobStoreKey).listWithPrefix(prefix);
// TODO: intern strings
// TODO: use a smarter data structure?
// TODO: These aren't really strings; they're actually hex of MD5s
for (ByteString key : keys) {
keyMap.put(key, node);
}
} catch (IOException e) {
log.warn("Failed to list storage node " + node, e);
retry.add(node);
}
}
queue = retry;
if (queue.isEmpty()) {
break;
}
TimeSpan.FIVE_SECONDS.doSafeSleep();
}
if (!queue.isEmpty()) {
log.warn("Some storage nodes were not reachable; treating as down: " + Joiner.on(";").join(queue));
}
}
// TODO: Throttle replication traffic
// -------------------------------------------------
// Copy any blobs without sufficient replicas
// -------------------------------------------------
// Repair the most under-replicated blobs first: one replica, then two, and so on
for (int i = 1; i < cluster.dataReplicaCount; i++) {
for (ByteString key : keyMap.keySet()) {
Set<StorageNode> nodes = keyMap.get(key);
if (nodes.size() != i) {
continue;
}
log.info("Node under-replicated: {} count={}", forDebug(key), i);
Set<StorageNode> replicated = replicate(key, nodes);
if (replicated.size() < cluster.dataReplicaCount) {
log.warn("Unable to copy to sufficient replicas: {}", key);
}
// TODO: Update map??
}
}
// -------------------------------------------------
// Find over-replicated blobs and work out which replicas to drop
// -------------------------------------------------
// TODO: Drop thread priority now?
for (ByteString key : keyMap.keySet()) {
Set<StorageNode> nodes = keyMap.get(key);
if (nodes.size() <= cluster.dataReplicaCount) {
continue;
}
log.warn("Node over-replicated: {} count={}", key);
Set<StorageNode> correct = Sets.newHashSet();
Set<StorageNode> shouldRemove = Sets.newHashSet();
Set<StorageNode> shouldAdd = Sets.newHashSet();
Iterator<StorageNode> walkRing = cluster.ring.walkRing(key);
while (walkRing.hasNext()) {
StorageNode node = walkRing.next();
if (nodes.contains(node)) {
correct.add(node);
} else {
shouldAdd.add(node);
}
// Stop once we've examined the first dataReplicaCount ring positions
if ((correct.size() + shouldAdd.size()) >= cluster.dataReplicaCount) {
break;
}
}
for (StorageNode node : nodes) {
if (!correct.contains(node)) {
shouldRemove.add(node);
}
}
// TODO: Remove by moving to cache
// TODO: Don't move stuff around if it's a node that's down...
log.warn("Should remove: {} add: {}", Joiner.on(";").join(shouldRemove), Joiner.on(";").join(shouldAdd));
}
// -------------------------------------------------
// Find blobs on the wrong nodes
// -------------------------------------------------
// TODO: We don't do this now; we rely on
log.warn("Blob moving to correct nodes is not implemented");
flushPeerRequests();
}
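/**
 * Sends each node its batched PeerRequest and clears the batch. Send
 * failures are logged but not retried.
 */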
private void flushPeerRequests() {
for (Entry<StorageNode, PeerRequest.Builder> entry : peerRequests.entrySet()) {
StorageNode node = entry.getKey();
PeerRequest.Builder prb = entry.getValue();
PeerRequest pr = prb.build();
try {
node.requestExecutor.execute(pr.toByteArray());
} catch (IOException e) {
log.error("Error enqueuing peer request", e);
}
}
peerRequests.clear();
}
private String forDebug(ByteString key) {
return Hex.toHex(key.toByteArray());
}
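/**
 * Walks the ring from the blob's position, enqueuing a copy to each node
 * that does not yet hold the blob until {@code dataReplicaCount} replicas
 * are reached or the ring is exhausted. Returns the nodes believed to hold
 * (or scheduled to receive) the blob.
 */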
private Set<StorageNode> replicate(ByteString key, Set<StorageNode> nodes) {
Iterator<StorageNode> walkRing = cluster.ring.walkRing(key);
Set<StorageNode> yes = Sets.newHashSet(nodes);
Set<StorageNode> no = Sets.newHashSet();
// TODO: Run in parallel?
while (walkRing.hasNext()) {
StorageNode node = walkRing.next();
if (yes.contains(node)) {
continue;
}
if (no.contains(node)) {
continue;
}
try {
copy(key, yes, node);
yes.add(node);
if (yes.size() >= cluster.dataReplicaCount) {
return yes;
}
} catch (IOException e) {
log.warn("Failed to copy to node " + node, e);
no.add(node);
}
}
return yes;
}
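/**
 * Enqueues a FixReplica instruction asking {@code dest} to obtain the blob;
 * despite the name, no data is transferred here. The {@code src} set is
 * currently unused: the destination node locates a source itself
 * (presumably via {@link #fixReplicate}) when it processes the request.
 */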
private void copy(ByteString key, Set<StorageNode> src, StorageNode dest) throws IOException {
PeerRequest.Builder peerRequest = getPeerRequest(dest);
FixReplica.Builder frb = peerRequest.addFixReplicaBuilder();
frb.setBlobStoreKey(blobStoreKey);
frb.addBlobKey(key);
}
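/**
 * Repairs a single replica on the receiving node: if the first
 * {@code dataReplicaCount} ring nodes already hold the blob, does nothing;
 * otherwise fetches the blob from a node that has it, falling back to the
 * {@code global} store, and writes it to the {@code local} store.
 */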
public void fixReplicate(BlobStore global, BlobStore local, ByteString key) throws IOException {
if (local.has(key, false)) {
return;
}
Iterator<StorageNode> walkRing = cluster.ring.walkRing(key);
// Regardless of what the state was when the request was sent, if the
// first N nodes now have the blob, then don't replicate
// TODO: Check first if we're one of the N nodes; if so, just copy if we
// don't have it
// TODO: If any of the N nodes don't have it, early-exit the loop
Set<StorageNode> yes = Sets.newHashSet();
Set<StorageNode> no = Sets.newHashSet();
// TODO: Run in parallel?
while (walkRing.hasNext()) {
StorageNode node = walkRing.next();
if (yes.contains(node)) {
continue;
}
if (no.contains(node)) {
continue;
}
try {
if (node.getBlobStore(blobStoreKey).has(key, false)) {
yes.add(node);
} else {
no.add(node);
}
} catch (IOException e) {
log.warn("Error communicating with node " + node, e);
no.add(node);
}
if ((yes.size() + no.size()) >= cluster.dataReplicaCount) {
break;
}
}
if (yes.size() >= cluster.dataReplicaCount) {
return;
}
BlobData data = null;
try {
for (StorageNode node : yes) {
try {
data = node.getBlobStore(blobStoreKey).find(key);
break;
} catch (IOException e) {
log.warn("Error communicating with node " + node, e);
}
}
if (data == null) {
data = global.find(key);
}
if (data == null) {
log.error("Unable to find blob: {}", forDebug(key));
return;
}
// TODO: This won't scale to large blob sizes
log.info("Writing replica on {}: {}", local, forDebug(key));
local.put(data);
} finally {
if (data != null) {
data.close();
}
}
}
}