private void performFailover(Integer failedNodeID) throws Exception
{
log.info("JBoss Messaging is failing over for failed node " + failedNodeID +
". If there are many messages to reload this may take some time...");
ClusterNotification notification = new ClusterNotification(ClusterNotification.TYPE_FAILOVER_START, failedNodeID.intValue(), null);
clusterNotifier.sendNotification(notification);
log.debug(this + " announced it is starting failover procedure");
pm.mergeTransactions(failedNodeID.intValue(), thisNodeID);
synchronized (failoverLock)
{
// Need to lock
boolean intr = Thread.interrupted();
for (;;)
{
try
{
lock.writeLock().acquire();
break;
}
catch (InterruptedException ex)
{
intr = true;
}
}
try
{
Map nameMap = (Map)nameMaps.get(failedNodeID);
List toRemove = new ArrayList();
if (nameMap != null)
{
Iterator iter = nameMap.values().iterator();
while (iter.hasNext())
{
Binding binding = (Binding)iter.next();
Queue queue = binding.queue;
if (queue.isRecoverable() && queue.getNodeID() == failedNodeID.intValue())
{
toRemove.add(binding);
}
}
}
Iterator iter = toRemove.iterator();
while (iter.hasNext())
{
Binding binding = (Binding)iter.next();
Condition condition = binding.condition;
Queue queue = binding.queue;
// Sanity check
if (!queue.isRecoverable())
{
throw new IllegalStateException("Found non recoverable queue " + queue.getName() +
" in map, these should have been removed!");
}
// Sanity check
if (!queue.isClustered())
{
throw new IllegalStateException("Queue " + queue.getName() + " is not clustered!");
}
// Remove from the in-memory map. There is no need to broadcast anything: the bindings will be
// removed from the other nodes' in-memory maps when those nodes detect the failure.
removeBindingInMemory(binding.queue.getNodeID(), binding.queue.getName());
// Find if there is a local queue with the same name
Queue localQueue = null;
if (localNameMap != null)
{
Binding b = (Binding)localNameMap.get(queue.getName());
if (b != null)
{
localQueue = b.queue;
}
}
if (localQueue != null)
{
// need to merge the queues
log.debug(this + " has already a queue: " + queue.getName() + " queue so merging queues");
localQueue.mergeIn(queue.getChannelID(), failedNodeID.intValue());
log.debug("Merged queue");
// Delete from storage
// Note that we must do this *after* any merge has been done.
// If we deleted the binding first and the merge then failed, the old channel would be deleted
// while the messages were still in the old channel. They would then have disappeared from the
// user's point of view, and correcting that would involve manual database intervention.
// See http://jira.jboss.com/jira/browse/JBMESSAGING-1113
deleteBindingFromStorage(queue);
log.debug(this + " deleted binding for " + queue.getName());
}
else
{
// Cannot failover if there is no queue deployed.
log.warn("Cannot failover " + queue.getName() +
" since it does not exist on this node. " +
"You must deploy your clustered destinations on ALL nodes of the cluster");
}
// Note we do not need to send an unbind request across the cluster - this is because
// when the node crashes a view change will hit the other nodes and that will cause
// all binding data for that node to be removed anyway.
}
log.debug(this + ": server side fail over is now complete");
}
finally
{
lock.writeLock().release();
if (intr)
Thread.currentThread().interrupt();
}
// Now clean the data for the failed node
// TODO - does this need to be inside the lock above?
cleanDataForNode(failedNodeID);
if (!keepOldFailoverModel)
{
notification = new ClusterNotification(ClusterNotification.TYPE_NODE_FAILEDOVER,
failedNodeID.intValue(),
null);
clusterNotifier.sendNotification(notification);
}
}
log.debug(this + " announcing that failover procedure is complete");
notification = new ClusterNotification(ClusterNotification.TYPE_FAILOVER_END, failedNodeID.intValue(), null);
clusterNotifier.sendNotification(notification);
//for testing only
sendJMXNotification(FAILOVER_COMPLETED_NOTIFICATION);