* Calculate the total capacity of all the datanodes. Sleep for
* WAIT_FOR_HEARTBEATS to be sure the datanodes have had a chance to
* heartbeat their capacities.
*/
Thread.sleep(WAIT_FOR_HEARTBEATS);
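// Snapshot the NN's current view of the cluster: the aggregate capacity of
// all live datanodes, and the capacity of a single datanode, which is used
// below to compute the expected capacity loss from failed volumes.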
final DatanodeManager dm =
    cluster.getNamesystem().getBlockManager().getDatanodeManager();
final long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
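// MiniDFSCluster names its storage directories data1..data6 under dataDir,
// two per datanode (data1/data2 for the first DN, data3/data4 for the
// second, data5/data6 for the third); the arithmetic below selects the
// first volume of each datanode, plus the second volume of the third.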
File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
File dn3Vol1 = new File(dataDir, "data"+(2*2+1));
File dn3Vol2 = new File(dataDir, "data"+(2*2+2));
/*
* Make the 1st volume directories on the first two datanodes
* inaccessible. We don't fail the 1st volume on all three datanodes
* since that would cause the entire write pipeline to fail: the
* client does not retry the failed nodes, even though they could
* still succeed using their remaining healthy volume.
*/
assertTrue("Couldn't chmod local vol", dn1Vol1.setExecutable(false));
assertTrue("Couldn't chmod local vol", dn2Vol1.setExecutable(false));
/*
* Create file1 and wait for 3 replicas (i.e. all DNs can still
* store a block). Then assert that all DNs are up, despite the
* volume failures.
*/
Path file1 = new Path("/test1");
DFSTestUtil.createFile(fs, file1, 1024, (short)3, 1L);
DFSTestUtil.waitReplication(fs, file1, (short)3);
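// waitReplication polls the namenode until /test1 is reported at the
// requested replication factor of three.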
ArrayList<DataNode> dns = cluster.getDataNodes();
assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
/*
* The metrics should confirm the volume failures.
*/
assertCounter("VolumeFailures", 1L,
getMetrics(dns.get(0).getMetrics().name()));
assertCounter("VolumeFailures", 1L,
getMetrics(dns.get(1).getMetrics().name()));
assertCounter("VolumeFailures", 0L,
getMetrics(dns.get(2).getMetrics().name()));
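// The VolumeFailures metric is datanode-local; the NN's aggregate view is
// verified separately below once the failures have been heartbeated.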
// Sanity check: the status waits below retry for up to ten heartbeat
// intervals, which must outlast the datanode death detection timeout.
assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
// Eventually the NN should report two volume failures
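// Each datanode has two volumes of equal capacity (both directories live
// on the same local filesystem), so losing one volume on each of two
// datanodes reduces the total capacity by one datanode's worth.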
DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
origCapacity - (1*dnCapacity), WAIT_FOR_HEARTBEATS);
/*
* Now fail a volume on the third datanode. We should be able to get
* three replicas since we've already identified the other failures.
*/
assertTrue("Couldn't chmod local vol", dn3Vol1.setExecutable(false));
Path file2 = new Path("/test2");
DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
DFSTestUtil.waitReplication(fs, file2, (short)3);
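// DN3 can still store the new block on its remaining healthy volume, so
// the write pipeline succeeds even though every datanode now has one
// failed volume.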
assertTrue("DN3 should still be up", dns.get(2).isDatanodeUp());
assertCounter("VolumeFailures", 1L,
getMetrics(dns.get(2).getMetrics().name()));
ArrayList<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
ArrayList<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
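// Query the NN-side view: DatanodeDescriptor#getVolumeFailures returns the
// failed-volume count each datanode reported in its heartbeats.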
dm.fetchDatanodes(live, dead, false);
assertEquals("DN3 should have 1 failed volume",
1, live.get(2).getVolumeFailures());
/*
* Once the datanodes have a chance to heartbeat their new capacity the