}
private void peerScan() throws ActiveFencingException {
Date cutTime = DateUtil.currentGMTTime();
Profiler profiler = new Profiler();
profiler.start();
Profiler profilerQueryActiveList = new Profiler();
profilerQueryActiveList.start();
List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
profilerQueryActiveList.stop();
Profiler profilerSyncClusterInfo = new Profiler();
profilerSyncClusterInfo.start();
List<ManagementServerHostVO> removedNodeList = new ArrayList<ManagementServerHostVO>();
List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<ManagementServerHostVO>();
if(_mshostId != null) {
if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation";
s_logger.error(msg);
throw new ActiveFencingException(msg);
}
// only if we have already attached to cluster, will we start to check leaving nodes
for(Map.Entry<Long, ManagementServerHostVO> entry : _activePeers.entrySet()) {
ManagementServerHostVO current = getInListById(entry.getKey(), currentList);
if(current == null) {
if(entry.getKey().longValue() != _mshostId.longValue()) {
if(s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
removedNodeList.add(entry.getValue());
}
} else {
if(current.getRunid() == 0) {
if(entry.getKey().longValue() != _mshostId.longValue()) {
if(s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left because of invalidated session, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
invalidatedNodeList.add(entry.getValue());
}
} else {
if(entry.getValue().getRunid() != current.getRunid()) {
if(s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left and rejoined quickly, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
entry.getValue().setRunid(current.getRunid());
}
}
}
}
}
profilerSyncClusterInfo.stop();
Profiler profilerInvalidatedNodeList = new Profiler();
profilerInvalidatedNodeList.start();
// process invalidated node list
if(invalidatedNodeList.size() > 0) {
for(ManagementServerHostVO mshost : invalidatedNodeList) {
_activePeers.remove(mshost.getId());
try {
JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
} catch(Exception e) {
s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
}
}
this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, invalidatedNodeList));
}
profilerInvalidatedNodeList.stop();
Profiler profilerRemovedList = new Profiler();
profilerRemovedList.start();
// process removed node list
Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
while(it.hasNext()) {
ManagementServerHostVO mshost = it.next();
if(!pingManagementNode(mshost)) {
s_logger.warn("Management node " + mshost.getId() + " is detected inactive by timestamp and also not pingable");
_activePeers.remove(mshost.getId());
try {
JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
} catch(Exception e) {
s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
}
} else {
s_logger.info("Management node " + mshost.getId() + " is detected inactive by timestamp but is pingable");
it.remove();
}
}
if(removedNodeList.size() > 0) {
this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, removedNodeList));
}
profilerRemovedList.stop();
List<ManagementServerHostVO> newNodeList = new ArrayList<ManagementServerHostVO>();
for(ManagementServerHostVO mshost : currentList) {
if(!_activePeers.containsKey(mshost.getId())) {
_activePeers.put(mshost.getId(), mshost);
if(s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node joined, id:" + mshost.getId() + ", nodeIP:" + mshost.getServiceIP());
}
newNodeList.add(mshost);
try {
JmxUtil.registerMBean("ClusterManager", "Node " + mshost.getId(), new ClusterManagerMBeanImpl(this, mshost));
} catch(Exception e) {
s_logger.warn("Unable to regiester cluster node into JMX monitoring due to exception " + ExceptionUtil.toString(e));
}
}
}
if(newNodeList.size() > 0) {
this.queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded, newNodeList));
}
profiler.stop();
if(profiler.getDuration() >= this._heartbeatInterval) {
if(s_logger.isDebugEnabled())
s_logger.debug("Peer scan takes too long to finish. profiler: " + profiler.toString()
+ ", profilerQueryActiveList: " + profilerQueryActiveList.toString()
+ ", profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString()
+ ", profilerInvalidatedNodeList: " + profilerInvalidatedNodeList.toString()
+ ", profilerRemovedList: " + profilerRemovedList.toString());
}
}