private Runnable getHeartbeatTask() {
return new Runnable() {
@Override
public void run() {
Transaction txn = Transaction.open("ClusterHeartBeat");
try {
Profiler profiler = new Profiler();
Profiler profilerHeartbeatUpdate = new Profiler();
Profiler profilerPeerScan = new Profiler();
Profiler profilerAgentLB = new Profiler();
try {
profiler.start();
profilerHeartbeatUpdate.start();
txn.transitToUserManagedConnection(getHeartbeatConnection());
if(s_logger.isTraceEnabled()) {
s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
}
_mshostDao.update(_mshostId, getCurrentRunId(), DateUtil.currentGMTTime());
profilerHeartbeatUpdate.stop();
profilerPeerScan.start();
if (s_logger.isTraceEnabled()) {
s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
}
if (!_peerScanInited) {
_peerScanInited = true;
initPeerScan();
}
peerScan();
profilerPeerScan.stop();
profilerAgentLB.start();
//initiate agent lb task will be scheduled and executed only once, and only when number of agents loaded exceeds _connectedAgentsThreshold
if (_agentLBEnabled && !_agentLbHappened) {
SearchCriteriaService<HostVO, HostVO> sc = SearchCriteria2.create(HostVO.class);
sc.addAnd(sc.getEntity().getManagementServerId(), Op.NNULL);
sc.addAnd(sc.getEntity().getType(), Op.EQ, Host.Type.Routing);
List<HostVO> allManagedRoutingAgents = sc.list();
sc = SearchCriteria2.create(HostVO.class);
sc.addAnd(sc.getEntity().getType(), Op.EQ, Host.Type.Routing);
List<HostVO> allAgents = sc.list();
double allHostsCount = allAgents.size();
double managedHostsCount = allManagedRoutingAgents.size();
if (allHostsCount > 0.0) {
double load = managedHostsCount/allHostsCount;
if (load >= _connectedAgentsThreshold) {
s_logger.debug("Scheduling agent rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold);
_rebalanceService.scheduleRebalanceAgents();
_agentLbHappened = true;
} else {
s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold);
}
}
}
profilerAgentLB.stop();
} finally {
profiler.stop();
if(profiler.getDuration() >= _heartbeatInterval) {
if(s_logger.isDebugEnabled())
s_logger.debug("Management server heartbeat takes too long to finish. profiler: " + profiler.toString() +
", profilerHeartbeatUpdate: " + profilerHeartbeatUpdate.toString() +
", profilerPeerScan: " + profilerPeerScan.toString() +
", profilerAgentLB: " + profilerAgentLB.toString());
}
}
} catch(CloudRuntimeException e) {
s_logger.error("Runtime DB exception ", e.getCause());
if(e.getCause() instanceof ClusterInvalidSessionException) {
s_logger.error("Invalid cluster session found, fence it");
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
}
if(isRootCauseConnectionRelated(e.getCause())) {
s_logger.error("DB communication problem detected, fence it");
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
}
invalidHeartbeatConnection();
} catch(ActiveFencingException e) {
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
} catch (Throwable e) {
s_logger.error("Unexpected exception in cluster heartbeat", e);
if(isRootCauseConnectionRelated(e.getCause())) {
s_logger.error("DB communication problem detected, fence it");
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
}
invalidHeartbeatConnection();
} finally {
txn.close("ClusterHeartBeat");
}
}
};
}