conf.setLong("dfs.heartbeat.interval", 30L);
conf.setBoolean("dfs.replication.considerLoad", false);
Random random = new Random();
FileSystem fs = null;
DFSClient dfsClient = null;
LocatedBlocks blocks = null;
int replicaCount = 0;
int rand = random.nextInt(numDataNodes);
MiniDFSCluster cluster = new MiniDFSCluster(conf, numDataNodes, true, null);
cluster.waitActive();
fs = cluster.getFileSystem();
Path file1 = new Path("/tmp/testBlockCorruptRecovery/file");
DFSTestUtil.createFile(fs, file1, 1024, numReplicas, 0);
Block blk = DFSTestUtil.getFirstBlock(fs, file1);
String block = blk.getBlockName();
dfsClient = new DFSClient(new InetSocketAddress("localhost",
cluster.getNameNodePort()), conf);
blocks = dfsClient.namenode.
getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
replicaCount = blocks.get(0).getLocations().length;
// Wait until block is replicated to numReplicas
while (replicaCount != numReplicas) {
try {
LOG.info("Looping until expected replicaCount of " + numReplicas +
"is reached");
Thread.sleep(1000);
} catch (InterruptedException ignore) {
}
blocks = dfsClient.namenode.
getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
replicaCount = blocks.get(0).getLocations().length;
}
assertTrue(blocks.get(0).isCorrupt() == false);
// Corrupt numCorruptReplicas replicas of block
int[] corruptReplicasDNIDs = new int[numCorruptReplicas];
for (int i=0, j=0; (j != numCorruptReplicas) && (i < numDataNodes); i++) {
if (corruptReplica(block, i))
corruptReplicasDNIDs[j++] = i;
}
// Restart the datanodes containing corrupt replicas
// so they would be reported to namenode and re-replicated
for (int i =0; i < numCorruptReplicas; i++)
cluster.restartDataNode(corruptReplicasDNIDs[i]);
// Loop until all corrupt replicas are reported
int corruptReplicaSize = cluster.getNameNode().namesystem.
corruptReplicas.numCorruptReplicas(blk);
while (corruptReplicaSize != numCorruptReplicas) {
try {
IOUtils.copyBytes(fs.open(file1), new IOUtils.NullOutputStream(),
conf, true);
} catch (IOException e) {
}
try {
LOG.info("Looping until expected " + numCorruptReplicas + " are " +
"reported. Current reported " + corruptReplicaSize);
Thread.sleep(1000);
} catch (InterruptedException ignore) {
}
corruptReplicaSize = cluster.getNameNode().namesystem.
corruptReplicas.numCorruptReplicas(blk);
}
// Loop until the block recovers after replication
blocks = dfsClient.namenode.
getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
replicaCount = blocks.get(0).getLocations().length;
while (replicaCount != numReplicas) {
try {
LOG.info("Looping until block gets rereplicated to " + numReplicas);
Thread.sleep(1000);
} catch (InterruptedException ignore) {
}
blocks = dfsClient.namenode.
getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
replicaCount = blocks.get(0).getLocations().length;
}
// Make sure the corrupt replica is invalidated and removed from
// corruptReplicasMap
corruptReplicaSize = cluster.getNameNode().namesystem.
corruptReplicas.numCorruptReplicas(blk);
while (corruptReplicaSize != 0 || replicaCount != numReplicas) {
try {
LOG.info("Looping until corrupt replica is invalidated");
Thread.sleep(1000);
} catch (InterruptedException ignore) {
}
corruptReplicaSize = cluster.getNameNode().namesystem.
corruptReplicas.numCorruptReplicas(blk);
blocks = dfsClient.namenode.
getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
replicaCount = blocks.get(0).getLocations().length;
}
// Make sure block is healthy
assertTrue(corruptReplicaSize == 0);
assertTrue(replicaCount == numReplicas);
assertTrue(blocks.get(0).isCorrupt() == false);
cluster.shutdown();
}