// Step 1.b, start a DN with slow heartbeat, so that we can know for sure it
// will be chosen as the target of excess replica during recommission.
hdfsConf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 30);
cluster.startDataNodes(hdfsConf, 1, true, null, null, null);
DataNode lastDN = cluster.getDataNodes().get(3);
lastDN.getDatanodeUuid();
// Step 2, decommission the first DN at both ANN and SBN.
DataNode firstDN = cluster.getDataNodes().get(0);
// Step 2.a, ask ANN to decomm the first DN
DatanodeInfo decommissionedNodeFromANN = decommissionNode(
0, firstDN.getDatanodeUuid(), null, AdminStates.DECOMMISSIONED);
// Step 2.b, ask SBN to decomm the first DN
DatanodeInfo decomNodeFromSBN = decommissionNode(1, firstDN.getDatanodeUuid(), null,
AdminStates.DECOMMISSIONED);
// Step 3, recommission the first DN on SBN and ANN to create excess replica
// It recommissions the node on SBN first to create a potentially
// inconsistent state. In a production cluster, such an inconsistent state
// can happen even if the recommission command was issued on ANN first,
// given the async nature of the system.
// Step 3.a, ask SBN to recomm the first DN.
// SBN has been fixed so that it no longer invalidates excess replica during
// recommission.
// Before the fix, SBN could get into the following state.
// 1. the last DN would have been chosen as excess replica, given its
// heartbeat is considered old.
// Please refer to BlockPlacementPolicyDefault#chooseReplicaToDelete
// 2. After recomissionNode finishes, SBN has 3 live replicas ( 0, 1, 2 )
// and one excess replica ( 3 )
// After the fix,
// After recomissionNode finishes, SBN has 4 live replicas ( 0, 1, 2, 3 )
Thread.sleep(slowHeartbeatDNwaitTime);
recomissionNode(1, decomNodeFromSBN);
// Step 3.b, ask ANN to recommission the first DN.
// To verify the fix, the test makes sure the excess replica picked by ANN
// is different from the one picked by SBN before the fix.
// To achieve that, we make sure next-to-last DN is chosen as excess replica
// by ANN.
// 1. restore LastDNprop's heartbeat interval.
// 2. Make next-to-last DN's heartbeat slow.
MiniDFSCluster.DataNodeProperties LastDNprop = cluster.stopDataNode(3);
LastDNprop.conf.setLong(
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, HEARTBEAT_INTERVAL);
cluster.restartDataNode(LastDNprop);
MiniDFSCluster.DataNodeProperties nextToLastDNprop = cluster.stopDataNode(2);
nextToLastDNprop.conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 30);
cluster.restartDataNode(nextToLastDNprop);
cluster.waitActive();
Thread.sleep(slowHeartbeatDNwaitTime);
recomissionNode(0, decommissionedNodeFromANN);
// Step 3.c, make sure the DN has deleted the block and report to NNs
cluster.triggerHeartbeats();
HATestUtil.waitForDNDeletions(cluster);
cluster.triggerDeletionReports();
// Step 4, decommission the first DN on both ANN and SBN
// With the fix to make sure SBN no longer marks excess replica
// during recommission, SBN's decommission can finish properly
decommissionNode(0, firstDN.getDatanodeUuid(), null,
AdminStates.DECOMMISSIONED);
// Ask SBN to decomm the first DN
decommissionNode(1, firstDN.getDatanodeUuid(), null,
AdminStates.DECOMMISSIONED);
cluster.shutdown();
}