nm1.registerNode();
nm2.registerNode(); // nm2 will not heartbeat with RM1
// create app that is saved on submission but removed from the saved state once it finishes
RMApp app0 = rm1.submitApp(200);
RMAppAttempt attempt0 = app0.getCurrentAppAttempt();
// spot check that app is saved
Assert.assertEquals(1, rmAppState.size());
nm1.nodeHeartbeat(true);
MockAM am0 = rm1.sendAMLaunched(attempt0.getAppAttemptId());
am0.registerAppAttempt();
am0.unregisterAppAttempt();
nm1.nodeHeartbeat(attempt0.getAppAttemptId(), 1, ContainerState.COMPLETE);
am0.waitForState(RMAppAttemptState.FINISHED);
rm1.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
// spot check that app is not saved anymore
Assert.assertEquals(0, rmAppState.size());
// create app that gets launched and does allocate before RM restart
RMApp app1 = rm1.submitApp(200);
// assert app1 info is saved
ApplicationState appState = rmAppState.get(app1.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext()
.getApplicationId(), app1.getApplicationSubmissionContext()
.getApplicationId());
//kick the scheduling to allocate AM container
nm1.nodeHeartbeat(true);
// assert app1 attempt is saved
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
Assert.assertEquals(1, appState.getAttemptCount());
ApplicationAttemptState attemptState =
appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1),
attemptState.getMasterContainer().getId());
// launch the AM
MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
// AM request for containers
am1.allocate("127.0.0.1" , 1000, 1, new ArrayList<ContainerId>());
// kick the scheduler
nm1.nodeHeartbeat(true);
List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// create app that does not get launched by RM before RM restart
RMApp app2 = rm1.submitApp(200);
// assert app2 info is saved
appState = rmAppState.get(app2.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext()
.getApplicationId(), app2.getApplicationSubmissionContext()
.getApplicationId());
// create unmanaged app
RMApp appUnmanaged = rm1.submitApp(200, "someApp", "someUser", null, true,
null, conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS), null);
ApplicationAttemptId unmanagedAttemptId =
appUnmanaged.getCurrentAppAttempt().getAppAttemptId();
// assert appUnmanaged info is saved
ApplicationId unmanagedAppId = appUnmanaged.getApplicationId();
appState = rmAppState.get(unmanagedAppId);
Assert.assertNotNull(appState);
// wait for attempt to reach LAUNCHED state
rm1.waitForState(unmanagedAttemptId, RMAppAttemptState.LAUNCHED);
rm1.waitForState(unmanagedAppId, RMAppState.ACCEPTED);
// assert unmanaged attempt info is saved
Assert.assertEquals(1, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext()
.getApplicationId(), appUnmanaged.getApplicationSubmissionContext()
.getApplicationId());
// PHASE 2: create new RM and start from old state
// create new RM to represent restart and recover state
MockRM rm2 = new MockRM(conf, memStore);
// start new RM
rm2.start();
// change NM to point to new RM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
nm2.setResourceTrackerService(rm2.getResourceTrackerService());
// verify load of old state
// only 2 apps are loaded: the unmanaged app is not loaded back because it
// cannot be restarted by the RM. This will change with work-preserving RM
// restart, in which AMs/NMs are not rebooted
Assert.assertEquals(2, rm2.getRMContext().getRMApps().size());
// verify correct number of attempts and other data
RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
Assert.assertNotNull(loadedApp1);
//Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
Assert.assertEquals(app1.getApplicationSubmissionContext()
.getApplicationId(), loadedApp1.getApplicationSubmissionContext()
.getApplicationId());
RMApp loadedApp2 = rm2.getRMContext().getRMApps().get(app2.getApplicationId());
Assert.assertNotNull(loadedApp2);
//Assert.assertEquals(0, loadedApp2.getAppAttempts().size());
Assert.assertEquals(app2.getApplicationSubmissionContext()
.getApplicationId(), loadedApp2.getApplicationSubmissionContext()
.getApplicationId());
// verify state machine kicked into expected states
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.ACCEPTED);
// verify new attempts created
Assert.assertEquals(2, loadedApp1.getAppAttempts().size());
Assert.assertEquals(1, loadedApp2.getAppAttempts().size());
// verify old AM is not accepted
// change running AM to talk to new RM
am1.setAMRMProtocol(rm2.getApplicationMasterService());
AllocateResponse allocResponse = am1.allocate(
new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>());
Assert.assertTrue(allocResponse.getAMCommand() == AMCommand.AM_RESYNC);
// NMs should be told to resync on heartbeat, even on the first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = rm2.registerNode("127.0.0.1:1234", 15120);
nm2 = rm2.registerNode("127.0.0.2:5678", 15120);
// verify no more resync responses are sent
hbResponse = nm1.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
// assert app1 attempt is saved
attempt1 = loadedApp1.getCurrentAppAttempt();
attemptId1 = attempt1.getAppAttemptId();
rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp1.getApplicationId());
attemptState = appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1),
attemptState.getMasterContainer().getId());
// Nodes on which the AMs run
MockNM am1Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString()
.contains("127.0.0.2")) {
am1Node = nm2;
}
// assert app2 attempt is saved
RMAppAttempt attempt2 = loadedApp2.getCurrentAppAttempt();
ApplicationAttemptId attemptId2 = attempt2.getAppAttemptId();
rm2.waitForState(attemptId2, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp2.getApplicationId());
attemptState = appState.getAttempt(attemptId2);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId2, 1),
attemptState.getMasterContainer().getId());
MockNM am2Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString()
.contains("127.0.0.2")) {
am2Node = nm2;
}
// start the AMs
am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
MockAM am2 = rm2.sendAMLaunched(attempt2.getAppAttemptId());
am2.registerAppAttempt();
//request for containers
am1.allocate("127.0.0.1" , 1000, 3, new ArrayList<ContainerId>());
am2.allocate("127.0.0.2" , 1000, 1, new ArrayList<ContainerId>());
// verify container allocate continues to work
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts = am1.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// finish the AMs
am1.unregisterAppAttempt();
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.FINISHING);
am1Node.nodeHeartbeat(attempt1.getAppAttemptId(), 1, ContainerState.COMPLETE);
am1.waitForState(RMAppAttemptState.FINISHED);
am2.unregisterAppAttempt();
rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.FINISHING);
am2Node.nodeHeartbeat(attempt2.getAppAttemptId(), 1, ContainerState.COMPLETE);
am2.waitForState(RMAppAttemptState.FINISHED);
// stop the RMs
rm2.stop();
rm1.stop();