/*
* Copyright 2007-2010 Sun Microsystems, Inc.
*
* This file is part of Project Darkstar Server.
*
* Project Darkstar Server is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation and
* distributed hereunder to you.
*
* Project Darkstar Server is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* --
*/
package com.sun.sgs.impl.service.nodemap.affinity.dlpa;
import com.sun.sgs.auth.Identity;
import com.sun.sgs.impl.service.nodemap.affinity.AffinityGroup;
import com.sun.sgs.impl.service.nodemap.affinity.LPAAffinityGroupFinder;
import
com.sun.sgs.impl.service.nodemap.affinity.AffinityGroupFinderFailedException;
import com.sun.sgs.impl.service.nodemap.affinity.AffinityGroupFinderStats;
import com.sun.sgs.impl.service.nodemap.affinity.BasicState;
import com.sun.sgs.impl.service.nodemap.affinity.RelocatingAffinityGroup;
import com.sun.sgs.impl.sharedutil.LoggerWrapper;
import com.sun.sgs.impl.sharedutil.PropertiesWrapper;
import com.sun.sgs.impl.util.Exporter;
import com.sun.sgs.impl.util.IoRunnable;
import com.sun.sgs.impl.util.NamedThreadFactory;
import com.sun.sgs.management.AffinityGroupFinderMXBean;
import com.sun.sgs.profile.ProfileCollector;
import com.sun.sgs.service.Node;
import com.sun.sgs.service.NodeListener;
import com.sun.sgs.service.WatchdogService;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.management.JMException;
/**
* The server portion of the distributed label propagation algorithm.
* <p>
* The server is known to each node participating in the algorithm. It is
* responsible for preparing the nodes for a run of the algorithm, coordinating
* the iterations of the algorithm, and collecting and merging results from
* each node when finished.
*/
public class LabelPropagationServer extends BasicState
implements LPAAffinityGroupFinder, LPAServer
{
/** Our property base name; also used as the name of our logger. */
private static final String PROP_NAME =
"com.sun.sgs.impl.service.nodemap.affinity";
/** Our class name, passed as the reporter name when an IO task fails. */
private static final String CLASS_NAME =
LabelPropagationServer.class.getName();
/** Our logger, named after {@code PROP_NAME} rather than this class. */
private static final LoggerWrapper logger =
new LoggerWrapper(Logger.getLogger(PROP_NAME));
/** The property name for the server port. */
public static final String SERVER_PORT_PROPERTY =
PROP_NAME + ".server.port";
/** The default value of the server port. */
public static final int DEFAULT_SERVER_PORT = 44537;
/** The name we export ourselves under. */
public static final String SERVER_EXPORT_NAME = "LabelPropagationServer";
/** The time, in minutes, to wait for all nodes to
* respond to asynchronous calls.
*/
private static final int TIMEOUT = 1; // minutes
/** The maximum number of iterations we will run. Interesting to set high
* for testing, but 5 has been shown to be adequate in most papers.
* For the distributed case, we seem to always converge within 10, and
* setting this to 5 cuts off some of the highest modularity solutions
* (running the distributed Zachary test network).
*/
private static final int MAX_ITERATIONS = 10;
/** Prefix for io task related properties. */
public static final String IO_TASK_PROPERTY_PREFIX =
"com.sun.sgs.impl.util.io.task";
/**
* An optional property that specifies the maximum number of retries for
* IO tasks in services.
*/
public static final String IO_TASK_RETRIES_PROPERTY =
IO_TASK_PROPERTY_PREFIX + ".max.retries";
/**
* An optional property that specifies the wait time between successive
* IO task retries.
*/
public static final String IO_TASK_WAIT_TIME_PROPERTY =
IO_TASK_PROPERTY_PREFIX + ".wait.time";
/** The default number of IO task retries. */
static final int DEFAULT_MAX_IO_ATTEMPTS = 5;
/**
* The default time interval to wait between IO task retries; the value
* ends up in {@code retryWaitTime}, which is in milliseconds.
*/
static final int DEFAULT_RETRY_WAIT_TIME = 100;
/**
* Our local watchdog service, used in case of IO failures.
* Can be null for testing.
* NOTE(review): the constructor calls {@code wdog.addNodeListener}
* unconditionally, so a null value would fail there - confirm whether
* null is still supported.
*/
private final WatchdogService wdog;
/** The time (in milliseconds) to wait between retries for IO
* operations. */
private final int retryWaitTime;
/** The maximum number of retry attempts for IO operations. */
private final int maxIoAttempts;
/** The exporter for this server. */
private final Exporter<LPAServer> exporter;
/** A map from node id to client proxy objects. */
private final Map<Long, LPAClient> clientProxyMap =
new ConcurrentHashMap<Long, LPAClient>();
/** A barrier that consists of the set of nodes we expect to hear back
* from asynchronous calls. Once this set is empty, we can move on to the
* next step of the algorithm. This data structure is required, rather
* than a simple Barrier, because our calls must be idempotent: a node is
* only counted the first time it is removed from this set.
*/
private final Set<Long> nodeBarrier =
Collections.synchronizedSet(new HashSet<Long>());
/**
* A latch to ensure our main thread waits for all nodes to complete
* each step of the algorithm before proceeding.
* This is replaced on each iteration.
* No synchronization is required on this latch.
*/
private volatile CountDownLatch latch;
// Algorithm iteration information
/**
* The current iteration of the algorithm, used for sanity checking;
* and set in a single thread. It is also read by
* {@code finishedIteration}.
* NOTE(review): this field is not volatile; if
* {@code finishedIteration} is invoked from a different thread than the
* one running the algorithm, visibility of updates is not guaranteed by
* this declaration alone - confirm.
*/
private int currentIteration;
/** True if we believe all nodes have converged. */
private volatile boolean nodesConverged;
/** Set to true if something has gone wrong and the results from
* this algorithm run should be ignored.
*/
private volatile boolean runFailed;
/**
* The exception to be associated with runFailed, including a detail
* message and causing exception (if there is one). During an algorithm
* iteration, this should be set once; writers check for null first so
* the earliest recorded failure is kept.
*/
private volatile AffinityGroupFinderFailedException runException;
/** A thread pool. Will create as many threads as needed, with a timeout
* of 60 sec before unused threads are reaped.
*/
private final ExecutorService executor = Executors.newCachedThreadPool(
new NamedThreadFactory("LabelPropagationServer"));
// TBD: we need to have a state, and not allow a run when we're shutting
// down, or shutdown while we're running. Will also need an
// enable/disable.
/** A lock to ensure we block a run of the algorithm if a current run
* is still going. TBD: what behavior do we want? Throw an exception?
* "merge" the two run attempts - e.g. second run just returns the result
* of the ongoing first one? Abort the first one?
*/
private final Object runningLock = new Object();
/** True if we're in the midst of an algorithm run. Access while holding
* the runningLock.
*/
private boolean running = false;
/** The algorithm run number, used to ensure that LPAClients are reporting
* results from the expected run. Incremented once per run when the nodes
* are told to prepare.
*/
private final AtomicLong runNumber = new AtomicLong();
/** Our JMX info. */
private final AffinityGroupFinderStats stats;
/**
 * Creates the label propagation server: reads the IO retry settings,
 * registers a node listener with the watchdog, exports the server under
 * {@code SERVER_EXPORT_NAME}, and registers the JMX MBean. Only one
 * server should exist within a Darkstar cluster.
 *
 * @param col the profile collector, used to register the MBean
 * @param wdog the watchdog service, used for error reporting
 * @param properties the application properties
 * @throws IOException if an error occurs
 */
public LabelPropagationServer(ProfileCollector col, WatchdogService wdog,
                              Properties properties)
    throws IOException
{
    this.wdog = wdog;
    final PropertiesWrapper config = new PropertiesWrapper(properties);

    // How long to pause between IO retries, and how many attempts to make.
    this.retryWaitTime = config.getIntProperty(
        IO_TASK_WAIT_TIME_PROPERTY, DEFAULT_RETRY_WAIT_TIME,
        0, Integer.MAX_VALUE);
    this.maxIoAttempts = config.getIntProperty(
        IO_TASK_RETRIES_PROPERTY, DEFAULT_MAX_IO_ATTEMPTS,
        0, Integer.MAX_VALUE);

    // Ask the watchdog to tell us about node health changes.
    final NodeListener failListener = new NodeFailListener();
    wdog.addNodeListener(failListener);

    // Make ourselves reachable by the nodes.
    final int port = config.getIntProperty(
        SERVER_PORT_PROPERTY, DEFAULT_SERVER_PORT, 0, 65535);
    this.exporter = new Exporter<LPAServer>(LPAServer.class);
    this.exporter.export(this, SERVER_EXPORT_NAME, port);

    // Publish our statistics through JMX.
    this.stats = new AffinityGroupFinderStats(this, col, MAX_ITERATIONS);
    try {
        col.registerMBean(this.stats,
                          AffinityGroupFinderMXBean.MXBEAN_NAME);
    } catch (JMException e) {
        // Not fatal: keep going without the bean, although failing to
        // register it is probably a very bad sign.
        logger.logThrow(Level.CONFIG, e, "Could not register MBean");
    }
}
// ---- Implement LPAAffinityGroupFinder --- //
/**
 * {@inheritDoc}
 * <p>
 * Runs are serialized through {@code runningLock}: a caller waits until
 * any run already in progress has finished. The {@code running} flag is
 * released in a {@code finally} block, so a run that ends with any
 * exception (checked or unchecked) cannot block later runs.
 *
 * @throws AffinityGroupFinderFailedException if the calling thread is
 *         interrupted while waiting for a current run to finish, or if
 *         the run fails while preparing or iterating
 */
public NavigableSet<RelocatingAffinityGroup> findAffinityGroups()
    throws AffinityGroupFinderFailedException
{
    checkForDisabledOrShutdownState();
    synchronized (runningLock) {
        while (running) {
            try {
                runningLock.wait();
            } catch (InterruptedException e) {
                // Keep the interrupt visible to our caller; we have not
                // claimed the running flag, so there is nothing to release.
                Thread.currentThread().interrupt();
                throw new AffinityGroupFinderFailedException(
                    "Interrupted while waiting for current run", e);
            }
        }
        running = true;
    }
    try {
        return runLabelPropagation();
    } finally {
        // Always hand the flag back, even on an unexpected exception;
        // otherwise every later caller would wait forever.
        synchronized (runningLock) {
            running = false;
            runningLock.notifyAll();
        }
    }
}

/**
 * Performs one complete run of the distributed algorithm. Must only be
 * called by the thread that currently owns the {@code running} flag.
 *
 * @return the merged affinity groups reported by the nodes
 * @throws AffinityGroupFinderFailedException if preparing the nodes or
 *         running the iterations failed
 */
private NavigableSet<RelocatingAffinityGroup> runLabelPropagation()
    throws AffinityGroupFinderFailedException
{
    // This server controls the running of the distributed label
    // propagation algorithm, using the LPAServer and LPAClient
    // interfaces. The protocol is:
    // Server calls each LPAClient.prepareAlgorithm().
    //   Nodes contact other nodes which their graphs might be
    //   connected to, using LPAClient.notifyCrossNodeEdges.
    //   Nodes can find the appropriate LPAClient by calling
    //   LPAServer.getLPAClientProxy.
    //   When finished exchanging information, each node calls
    //   LPAServer.readyToBegin().
    // Server begins iterations of the algorithm. For each iteration,
    // it calls LPAClient.startIteration().
    //   Nodes compute one iteration of the label propagation algorithm.
    //   Remote information (cross node edges discovered above) can
    //   be found by calling LPAClient.getRemoteLabels on other nodes.
    //   When finished, each node calls LPAServer.finishedIteration,
    //   noting whether it believes the algorithm has converged.
    // When all nodes agree that the algorithm has converged, or many
    // iterations have been run, the server gathers all group information
    // from each node by calling LPAClient.getAffinityGroups(). The server
    // combines groups that might cross nodes, and creates new, final
    // affinity group information.
    long startTime = System.currentTimeMillis();
    stats.runsCountInc();

    // Don't pay any attention to changes while we're running, at least
    // to start with. If a node fails, we'll stop and return no information
    // for now. When a node fails, a lot of changes will occur in the
    // graphs as we move identities from the failed node to new nodes.
    //
    // We make this copy unmodifiable to catch any errors in our code
    // breaking this assumption.
    final Map<Long, LPAClient> clientProxyCopy =
        Collections.unmodifiableMap(
            new HashMap<Long, LPAClient>(clientProxyMap));

    runFailed = false;
    runException = null;
    nodesConverged = false;

    // Tell each node to prepare for the algorithm to start
    prepareAlgorithm(clientProxyCopy);
    if (runFailed) {
        handleFailure("could not prepare");
        throw runException;
    }
    if (logger.isLoggable(Level.FINE)) {
        long time = System.currentTimeMillis() - startTime;
        logger.log(Level.FINE,
                   "Algorithm prepare took {0} milliseconds", time);
    }

    // Run the algorithm in multiple iterations, until it has failed
    // or converged
    runIterations(clientProxyCopy);
    if (runFailed) {
        handleFailure("could not complete iterations");
        throw runException;
    }

    // Now, gather up our results.
    // If, after this point, we cannot contact a node, simply
    // return the information that we have.
    // Assuming a node has failed, we won't report the identities
    // on the failed node as being part of any group.
    NavigableSet<RelocatingAffinityGroup> retVal =
        gatherFinalGroups(clientProxyCopy);

    long runTime = System.currentTimeMillis() - startTime;
    stats.runtimeSample(runTime);
    stats.iterationsSample(currentIteration);
    stats.setNumGroups(retVal.size());
    if (logger.isLoggable(Level.FINE)) {
        logger.log(Level.FINE, "Algorithm took {0} milliseconds and {1} " +
                   "iterations", runTime, currentIteration);
        StringBuilder sb = new StringBuilder();
        sb.append(" LPA found " + retVal.size() + " groups ");
        for (AffinityGroup group : retVal) {
            sb.append(" id: " + group.getId() + ": members ");
            for (Identity id : group.getIdentities()) {
                sb.append(id + " ");
            }
        }
        logger.log(Level.FINE, sb.toString());
    }
    return retVal;
}

/**
 * Helper function which records a failed run in the statistics and sets
 * {@code runException} if it has not yet been set. It does not touch the
 * {@code running} flag; {@code findAffinityGroups} releases that when it
 * exits.
 *
 * @param msg the detail message to use if no exception was recorded yet
 */
private void handleFailure(String msg) {
    stats.failedCountInc();
    stats.setNumGroups(0);
    if (runException == null) {
        runException = new AffinityGroupFinderFailedException(msg);
    }
}
/** {@inheritDoc} */
public void disable() {
    if (!setDisabledState()) {
        // Only propagate to the clients when setDisabledState() says so.
        return;
    }
    for (Map.Entry<Long, LPAClient> entry : clientProxyMap.entrySet()) {
        final IoRunnable task = new DisableTask(entry.getValue());
        runIoTask(task, wdog, entry.getKey(),
                  maxIoAttempts, retryWaitTime, CLASS_NAME);
    }
}
/** IO task which asks a single client proxy to disable itself. */
private static class DisableTask implements IoRunnable {
    /** The client proxy to be disabled. */
    private final LPAClient client;

    /**
     * Creates a task for the given proxy.
     * @param client the client proxy to disable when the task runs
     */
    DisableTask(LPAClient client) {
        this.client = client;
    }

    /** {@inheritDoc} */
    public void run() throws IOException {
        client.disable();
    }
}
/** {@inheritDoc} */
public void enable() {
    if (!setEnabledState()) {
        // Only propagate to the clients when setEnabledState() says so.
        return;
    }
    for (Map.Entry<Long, LPAClient> entry : clientProxyMap.entrySet()) {
        final IoRunnable task = new EnableTask(entry.getValue());
        runIoTask(task, wdog, entry.getKey(),
                  maxIoAttempts, retryWaitTime, CLASS_NAME);
    }
}
/** IO task which asks a single client proxy to enable itself. */
private static class EnableTask implements IoRunnable {
    /** The client proxy to be enabled. */
    private final LPAClient client;

    /**
     * Creates a task for the given proxy.
     * @param client the client proxy to enable when the task runs
     */
    EnableTask(LPAClient client) {
        this.client = client;
    }

    /** {@inheritDoc} */
    public void run() throws IOException {
        client.enable();
    }
}
/** {@inheritDoc} */
public void shutdown() {
    if (!setShutdownState()) {
        return;
    }
    for (Map.Entry<Long, LPAClient> entry : clientProxyMap.entrySet()) {
        final LPAClient client = entry.getValue();
        try {
            client.shutdown();
            // Forget the proxy only once it has been told to shut down.
            clientProxyMap.remove(entry.getKey());
        } catch (IOException e) {
            // An unreachable client is acceptable here: the entire
            // system might be coming down.
        }
    }
    // Stop accepting remote calls, then stop our worker threads.
    exporter.unexport();
    executor.shutdownNow();
}
// --- Implement LPAServer --- //
/**
 * {@inheritDoc}
 * <p>
 * A reported failure marks the whole run as failed. In every case the
 * node is then accounted for on the current barrier.
 */
public void readyToBegin(long nodeId, boolean failed) throws IOException {
    if (!failed) {
        maybeCountDown(nodeId);
        return;
    }
    // Keep the first recorded failure; later ones do not replace it.
    if (runException == null) {
        runException = new AffinityGroupFinderFailedException(
            "node " + nodeId + " reports failure preparing");
    }
    logger.log(Level.INFO, "node {0} reports failure", nodeId);
    runFailed = true;
    maybeCountDown(nodeId);
}
/**
 * {@inheritDoc}
 * <p>
 * The run is marked failed if the node reports a failure or reports an
 * iteration other than the one currently in progress. A node reporting
 * that it has not converged clears {@code nodesConverged} for this
 * iteration. In every case the node is then accounted for on the current
 * barrier.
 */
public void finishedIteration(long nodeId, boolean converged,
                              boolean failed, int iteration)
    throws IOException
{
    if (failed) {
        String msg = "node " + nodeId +
                     " reports failure in iteration " + iteration;
        if (runException == null) {
            runException = new AffinityGroupFinderFailedException(msg);
        }
        logger.log(Level.INFO, "node {0} reports failure", nodeId);
        runFailed = true;
    }
    if (iteration != currentIteration) {
        String msg = "node " + nodeId +
                     " reports unexpected iteration " + iteration;
        if (runException == null) {
            runException = new AffinityGroupFinderFailedException(msg);
        }
        logger.log(Level.INFO, "unexpected iteration: {0} on node {1}, " +
                   "expected {2}, marking run failed",
                   iteration, nodeId, currentIteration);
        runFailed = true;
    }
    // Only ever clear the flag here. Writing back
    // "converged && nodesConverged" is a read-modify-write on a volatile:
    // a concurrent caller reporting converged == true could overwrite
    // another caller's false and make the server stop iterating too early.
    if (!converged) {
        nodesConverged = false;
    }
    maybeCountDown(nodeId);
}
/**
 * {@inheritDoc}
 * <p>
 * The lookup is made in the map of registered proxies, so the result is
 * {@code null} when no proxy is currently held for {@code nodeId}.
 */
public LPAClient getLPAClientProxy(long nodeId) throws IOException {
    final Long key = Long.valueOf(nodeId);
    return clientProxyMap.get(key);
}
/**
 * {@inheritDoc}
 * <p>
 * Any proxy previously stored for {@code nodeId} is replaced.
 */
public void register(long nodeId, LPAClient client) throws IOException {
    final Long key = Long.valueOf(nodeId);
    clientProxyMap.put(key, client);
}
/**
 * The listener registered with the watchdog service. It is notified of
 * node health updates and drops our cached proxy for any node whose
 * health is reported as {@code RED}.
 */
private class NodeFailListener implements NodeListener {
    /** Creates the listener; there is no state to set up. */
    NodeFailListener() {
    }

    /** {@inheritDoc} */
    public void nodeHealthUpdate(Node node) {
        switch (node.getHealth()) {
            case RED: {
                final long failedNodeId = node.getId();
                removeNode(failedNodeId);
                break;
            }
            default:
                // Every other health value is ignored.
                break;
        }
    }
}
/**
 * Drops the cached client proxy for a failed node, if one is held.
 * @param nodeId the Id of the failed node
 */
private void removeNode(long nodeId) {
    final Long key = Long.valueOf(nodeId);
    clientProxyMap.remove(key);
}
/**
 * Tells each registered LPAClient to prepare for a run of the algorithm,
 * then waits until every node has been accounted for on the barrier (or
 * the wait fails). Starts a new run number.
 *
 * @param clientProxies a map of node ids to LPAClient proxies
 */
private void prepareAlgorithm(Map<Long, LPAClient> clientProxies) {
    // Reset the barrier so that it expects one reply per node.
    nodeBarrier.clear();
    nodeBarrier.addAll(clientProxies.keySet());
    latch = new CountDownLatch(clientProxies.size());

    final long runNum = runNumber.incrementAndGet();
    for (Map.Entry<Long, LPAClient> entry : clientProxies.entrySet()) {
        final long nodeId = entry.getKey();
        final LPAClient client = entry.getValue();
        final IoRunnable prepareTask = new IoRunnable() {
            public void run() throws IOException {
                client.prepareAlgorithm(runNum);
            }
        };
        try {
            final boolean reached = runIoTask(
                prepareTask, wdog, nodeId,
                maxIoAttempts, retryWaitTime, CLASS_NAME);
            if (reached) {
                continue;
            }
            if (runException == null) {
                runException = new AffinityGroupFinderFailedException(
                    "node " + nodeId +
                    " could not be contacted to prepare " + runNum);
            }
            runFailed = true;
            // The node will never answer, so account for it ourselves.
            maybeCountDown(nodeId);
            // A proxy that stays unreachable after all retries is
            // dropped from the clientProxyMap.
            removeNode(nodeId);
        } catch (Exception e) {
            if (runException == null) {
                runException = new AffinityGroupFinderFailedException(
                    "node " + nodeId +
                    " exception while preparing " + runNum, e);
            }
            logger.logThrow(Level.INFO, e,
                            "exception from node {0} while preparing",
                            nodeId);
            runFailed = true;
            maybeCountDown(nodeId);
        }
    }

    // Block until initialization has completed on all nodes.
    waitOnLatch();
}
/**
 * Runs iterations of the algorithm until the run fails, all LPAClients
 * report convergence, or {@code MAX_ITERATIONS} iterations have run.
 *
 * @param clientProxies a map of node ids to LPAClient proxies
 */
private void runIterations(Map<Long, LPAClient> clientProxies) {
    final int expectedReplies = clientProxies.size();
    currentIteration = 0;
    while (!(runFailed || nodesConverged)) {
        // Start out optimistic: the flag stays true only if every node
        // reports convergence for this iteration.
        nodesConverged = true;
        assert (nodeBarrier.isEmpty());
        nodeBarrier.addAll(clientProxies.keySet());
        latch = new CountDownLatch(expectedReplies);

        // The iteration number does not change while we contact nodes.
        final int iteration = currentIteration;
        for (Map.Entry<Long, LPAClient> entry : clientProxies.entrySet()) {
            final long nodeId = entry.getKey();
            final LPAClient client = entry.getValue();
            final IoRunnable startTask = new IoRunnable() {
                public void run() throws IOException {
                    client.startIteration(iteration);
                }
            };
            try {
                final boolean reached = runIoTask(
                    startTask, wdog, nodeId,
                    maxIoAttempts, retryWaitTime, CLASS_NAME);
                if (reached) {
                    continue;
                }
                if (runException == null) {
                    runException = new AffinityGroupFinderFailedException(
                        "node " + nodeId +
                        " could not be contacted for iteration " +
                        iteration);
                }
                runFailed = true;
                // The node will never answer, so account for it ourselves.
                maybeCountDown(nodeId);
                // A proxy that stays unreachable after all retries is
                // dropped from the clientProxyMap.
                removeNode(nodeId);
            } catch (Exception e) {
                if (runException == null) {
                    runException = new AffinityGroupFinderFailedException(
                        "node " + nodeId +
                        " exception for iteration " +
                        iteration, e);
                }
                logger.logThrow(Level.INFO, e,
                                "exception from node {0} while running " +
                                "iteration {1}",
                                nodeId, iteration);
                runFailed = true;
                maybeCountDown(nodeId);
            }
        }

        // Block until all nodes have completed this iteration.
        waitOnLatch();

        // Papers show most work is done after 5 iterations
        currentIteration++;
        if (currentIteration >= MAX_ITERATIONS) {
            stats.stoppedCountInc();
            logger.log(Level.FINE, "exceeded {0} iterations, stopping",
                       MAX_ITERATIONS);
            break;
        }
    }
}
/**
 * Asks each of the LPAClients, in parallel on the executor, for the
 * final affinity groups it found, then merges the answers so that the
 * final groups can span nodes. Each identity is assigned to exactly one
 * group: the first one it is seen in while merging.
 *
 * @param clientProxies a map of node ids to LPAClient proxies
 * @return the merged affinity groups found on each LPAClient
 */
private NavigableSet<RelocatingAffinityGroup> gatherFinalGroups(
    Map<Long, LPAClient> clientProxies)
{
    // From here on an unreachable node is not an error: we simply return
    // the information we did receive, and identities on a failed node
    // are not reported as being part of any group.
    final Map<Long, Set<AffinityGroup>> groupsByNode =
        new ConcurrentHashMap<Long, Set<AffinityGroup>>();

    nodeBarrier.clear();
    nodeBarrier.addAll(clientProxies.keySet());
    latch = new CountDownLatch(clientProxies.size());

    final long runNum = runNumber.get();
    for (Map.Entry<Long, LPAClient> entry : clientProxies.entrySet()) {
        final Long nodeId = entry.getKey();
        final LPAClient client = entry.getValue();
        final IoRunnable fetchGroups = new IoRunnable() {
            public void run() throws IOException {
                groupsByNode.put(nodeId,
                                 client.getAffinityGroups(runNum, true));
            }
        };
        executor.execute(new Runnable() {
            public void run() {
                try {
                    final boolean reached = runIoTask(
                        fetchGroups, wdog, nodeId,
                        maxIoAttempts, retryWaitTime, CLASS_NAME);
                    maybeCountDown(nodeId);
                    if (!reached) {
                        removeNode(nodeId);
                    }
                } catch (Exception e) {
                    logger.logThrow(Level.INFO, e,
                        "exception from node {0} while returning groups",
                        nodeId);
                    maybeCountDown(nodeId);
                }
            }
        });
    }

    // Block until every request has completed (or the wait fails).
    waitOnLatch();

    // group id -> (identity -> node the identity was reported from)
    final Map<Long, Map<Identity, Long>> membersByGroup =
        new HashMap<Long, Map<Identity, Long>>();
    // Identities already placed in a group; each may appear only once.
    final Set<Identity> assigned = new HashSet<Identity>();
    for (Map.Entry<Long, Set<AffinityGroup>> reported :
             groupsByNode.entrySet())
    {
        final Long nodeId = reported.getKey();
        for (AffinityGroup group : reported.getValue()) {
            final long groupId = group.getId();
            Map<Identity, Long> members = membersByGroup.get(groupId);
            if (members == null) {
                members = new HashMap<Identity, Long>();
                membersByGroup.put(groupId, members);
            }
            for (Identity identity : group.getIdentities()) {
                if (!assigned.add(identity)) {
                    // Already placed in the first group it was seen in.
                    continue;
                }
                members.put(identity, nodeId);
            }
        }
    }

    // Turn the merged membership maps into the final group objects.
    final NavigableSet<RelocatingAffinityGroup> merged =
        new TreeSet<RelocatingAffinityGroup>();
    for (Map.Entry<Long, Map<Identity, Long>> group :
             membersByGroup.entrySet())
    {
        merged.add(new RelocatingAffinityGroup(group.getKey(),
                                               group.getValue(),
                                               runNum));
    }
    return merged;
}
/**
 * Waits on the global latch for at most {@code TIMEOUT} minutes. If the
 * wait times out or is interrupted, the run is marked failed and
 * {@code runException} is set if it has not been set already. On
 * interruption the thread's interrupt status is restored so that code
 * further up the call stack can still observe it.
 */
private void waitOnLatch() {
    try {
        boolean ok = latch.await(TIMEOUT, TimeUnit.MINUTES);
        if (!ok) {
            // We timed out on the latch, invalidating this run.
            String msg = "Latch timed out";
            if (runException == null) {
                runException = new AffinityGroupFinderFailedException(msg);
            }
            runFailed = true;
        }
    } catch (InterruptedException ex) {
        // Catching InterruptedException clears the interrupt status;
        // set it again rather than silently consuming the request.
        Thread.currentThread().interrupt();
        String msg = "Latch timed interrupted";
        if (runException == null) {
            runException = new AffinityGroupFinderFailedException(msg, ex);
        }
        runFailed = true;
    }
}
/**
 * Counts down {@code latch} once for the given node, but only if the
 * node was still present in the {@code nodeBarrier}. Removing the node
 * from the barrier first means repeated calls for the same node count
 * down at most once.
 *
 * @param nodeId the ID of the node we're accounting for
 */
private void maybeCountDown(long nodeId) {
    final boolean stillExpected = nodeBarrier.remove(Long.valueOf(nodeId));
    if (!stillExpected) {
        return;
    }
    latch.countDown();
}
/**
 * Executes the specified {@code ioTask} by invoking its {@link
 * IoRunnable#run run} method. If the specified task throws an
 * {@code IOException}, this method will retry the task for a fixed
 * number of times. The number of retries and the wait time between
 * retries are configurable properties.
 * <p>
 * The task is always attempted at least once, even if {@code maxTries}
 * is zero or negative, and at most {@code maxTries} times otherwise. If
 * every attempt fails, a warning is logged and the failure is reported
 * to the watchdog service (when one was supplied).
 * <p>
 * An interrupt received while waiting between retries does not stop the
 * retries; it is remembered and the thread's interrupt status is
 * restored before this method returns.
 * <p>
 * This is much the same as the like method in AbstractService, except
 * we don't bother to check for a transactional context (we won't be in
 * one).
 *
 * @param ioTask a task with IO-related operations
 * @param wdog the watchdog service for the local node, in case of failure;
 *        may be {@code null}, in which case the failure is only logged
 * @param nodeId the node that should be shut down in case of failure
 * @param maxTries the number of times to attempt the retry
 * @param waitTime the amount of time, in milliseconds, to wait before
 *        retry
 * @param name name of caller, in case of failure
 *
 * @return {@code true} if the ioTask ran successfully
 */
static boolean runIoTask(IoRunnable ioTask, WatchdogService wdog,
                         long nodeId, int maxTries, int waitTime,
                         String name)
{
    boolean interrupted = false;
    try {
        // Count down so that the loop is guaranteed to terminate.
        int attemptsLeft = Math.max(1, maxTries);
        while (true) {
            try {
                ioTask.run();
                return true;
            } catch (IOException e) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.logThrow(Level.FINEST, e,
                                    "IoRunnable {0} throws", ioTask);
                }
                attemptsLeft--;
                if (attemptsLeft <= 0) {
                    // Out of attempts; no point in sleeping again.
                    break;
                }
                try {
                    // TBD: what back-off policy do we want here?
                    Thread.sleep(waitTime);
                } catch (InterruptedException ie) {
                    // Keep retrying, but remember the request so it can
                    // be re-asserted once we are done.
                    interrupted = true;
                }
            }
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
    logger.log(Level.WARNING,
               "A communication error occured while running an " +
               "IO task. Could not reach node {0}.", nodeId);
    if (wdog != null) {
        wdog.reportFailure(nodeId, name);
    }
    return false;
}
}