@SuppressWarnings("unchecked")
@Override
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
throws YarnRemoteException {
NodeStatus remoteNodeStatus = request.getNodeStatus();
/**
* Here is the node heartbeat sequence...
* 1. Check if it's a registered node
* 2. Check if it's a valid (i.e. not excluded) node
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
* 4. Send healthStatus to RMNode
*/
NodeId nodeId = remoteNodeStatus.getNodeId();
// 1. Check if it's a registered node
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
if (rmNode == null) {
/* node does not exist */
LOG.info("Node not found rebooting " + remoteNodeStatus.getNodeId());
return reboot;
}
// Send ping
this.nmLivelinessMonitor.receivedPing(nodeId);
// 2. Check if it's a valid (i.e. not excluded) node
if (!this.nodesListManager.isValidNode(rmNode.getHostName())) {
LOG.info("Disallowed NodeManager nodeId: " + nodeId + " hostname: "
+ rmNode.getNodeAddress());
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
return shutDown;
}
NodeHeartbeatResponse nodeHeartBeatResponse = recordFactory
.newRecordInstance(NodeHeartbeatResponse.class);
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
HeartbeatResponse lastHeartbeatResponse = rmNode.getLastHeartBeatResponse();
if (remoteNodeStatus.getResponseId() + 1 == lastHeartbeatResponse
.getResponseId()) {
LOG.info("Received duplicate heartbeat from node "
+ rmNode.getNodeAddress());
nodeHeartBeatResponse.setHeartbeatResponse(lastHeartbeatResponse);
return nodeHeartBeatResponse;
} else if (remoteNodeStatus.getResponseId() + 1 < lastHeartbeatResponse
.getResponseId()) {
LOG.info("Too far behind rm response id:"
+ lastHeartbeatResponse.getResponseId() + " nm response id:"
+ remoteNodeStatus.getResponseId());
// TODO: Just sending reboot is not enough. Think more.
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING));
return reboot;
}
// Heartbeat response
HeartbeatResponse latestResponse = recordFactory
.newRecordInstance(HeartbeatResponse.class);
latestResponse.setResponseId(lastHeartbeatResponse.getResponseId() + 1);
latestResponse.addAllContainersToCleanup(rmNode.getContainersToCleanUp());
latestResponse.addAllApplicationsToCleanup(rmNode.getAppsToCleanup());
latestResponse.setNodeAction(NodeAction.NORMAL);
// Check if node's masterKey needs to be updated and if the currentKey has
// roller over, send it across
if (isSecurityEnabled()) {
boolean shouldSendMasterKey = false;
MasterKey nextMasterKeyForNode =
this.containerTokenSecretManager.getNextKey();
if (nextMasterKeyForNode != null) {
// nextMasterKeyForNode can be null if there is no outstanding key that
// is in the activation period.
MasterKey nodeKnownMasterKey = request.getLastKnownMasterKey();
if (nodeKnownMasterKey.getKeyId() != nextMasterKeyForNode.getKeyId()) {
shouldSendMasterKey = true;
}
}
if (shouldSendMasterKey) {
latestResponse.setMasterKey(nextMasterKeyForNode);
}
}
// 4. Send status to RMNode, saving the latest response.
this.rmContext.getDispatcher().getEventHandler().handle(
new RMNodeStatusEvent(nodeId, remoteNodeStatus.getNodeHealthStatus(),
remoteNodeStatus.getContainersStatuses(),
remoteNodeStatus.getKeepAliveApplications(), latestResponse));
nodeHeartBeatResponse.setHeartbeatResponse(latestResponse);
return nodeHeartBeatResponse;
}