* @param status the node that has just completed
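* @return a NodeCompletionResult describing the outcome, including whether
* the container was a surplus node or is being treated as a failed container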
*/
public synchronized NodeCompletionResult onCompletedNode(YarnConfiguration amConf,
ContainerStatus status) {
ContainerId containerId = status.getContainerId();
NodeCompletionResult result = new NodeCompletionResult();
RoleInstance roleInstance;
if (containersBeingReleased.containsKey(containerId)) {
log.info("Container was queued for release");
Container container = containersBeingReleased.remove(containerId);
RoleStatus roleStatus = lookupRoleStatus(container);
log.info("decrementing role count for role {}", roleStatus.getName());
roleStatus.decReleasing();
roleStatus.decActual();
roleStatus.incCompleted();
roleHistory.onReleaseCompleted(container);
} else if (surplusNodes.remove(containerId)) {
//its a surplus one being purged
result.surplusNode = true;
} else {
//a container has failed
result.containerFailed = true;
roleInstance = activeContainers.remove(containerId);
if (roleInstance != null) {
//it was active, move it to failed
incFailedCountainerCount();
failedNodes.put(containerId, roleInstance);
} else {
// the container may have been noted as failed already, so look
// it up
roleInstance = failedNodes.get(containerId);
}
if (roleInstance != null) {
int roleId = roleInstance.roleId;
log.info("Failed container in role {}", roleId);
try {
RoleStatus roleStatus = lookupRoleStatus(roleId);
roleStatus.decActual();
boolean shortLived = isShortLived(roleInstance);
String message;
if (roleInstance.container != null) {
String user = null;
try {
user = SliderUtils.getCurrentUser().getShortUserName();
} catch (IOException ignored) {
}
String completedLogsUrl = null;
Container c = roleInstance.container;
String url = null;
if (amConf != null) {
url = amConf.get(YarnConfiguration.YARN_LOG_SERVER_URL);
}
if (user != null && url != null) {
completedLogsUrl = url
+ "/" + c.getNodeId() + "/" + roleInstance.getContainerId() + "/ctx/" + user;
}
message = String.format("Failure %s on host %s" +
(completedLogsUrl != null ? ", see %s" : ""), roleInstance.getContainerId(),
c.getNodeId().getHost(), completedLogsUrl);
} else {
message = String.format("Failure %s",
containerId.toString());
}
roleStatus.noteFailed(message);
//have a look to see if it short lived
if (shortLived) {
roleStatus.incStartFailed();
}
if (roleInstance.container != null) {
roleHistory.onFailedContainer(roleInstance.container, shortLived);
}
} catch (YarnRuntimeException e1) {
log.error("Failed container of unknown role {}", roleId);
}
} else {
//this isn't a known container.
log.error("Notified of completed container {} that is not in the list" +
" of active or failed containers", containerId);
completionOfUnknownContainerEvent.incrementAndGet();
}
}
if (result.surplusNode) {
//a surplus node
return result;
}
//record the complete node's details; this pulls it from the livenode set
//remove the node
ContainerId id = status.getContainerId();
RoleInstance node = getLiveNodes().remove(id);
if (node == null) {
log.warn("Received notification of completion of unknown node {}", id);
completionOfNodeNotInLiveListEvent.incrementAndGet();