package edu.purdue.wind.rtsj;
import javax.realtime.RealtimeThread;
import javax.realtime.PriorityParameters;
import javax.realtime.PeriodicParameters;
import javax.realtime.ReleaseParameters;
import edu.purdue.wind.Log;
/**
 * This watchdog implementation collects checkins from live (and
 * ostensibly correctly-performing) processes on a periodic basis, and
 * restarts subsystems that fail to check in for a configurable number
 * of periods.  It is capable of managing up to 32 processes, identified
 * by an integer assigned at registration time.
 *
 * <p>Locking: {@code checkinLock} guards the {@code active} and
 * {@code alive} bit masks; {@code configLock} guards the registration
 * tables.  {@link #activate(int)} acquires {@code configLock} and then
 * {@code checkinLock}; the periodic service routine never holds both at
 * the same time.
 */
public class Watchdog {
    /**
     * Maximum number of services.  Each service owns one bit of the
     * {@code int} masks {@code active} and {@code alive}.
     */
    private static final int MAX_SERVICES = 32;

    /** Protects {@code instance} and {@code starting} */
    private static final Object instanceLock = new Object();
    private static Watchdog instance;
    /**
     * Set once the watchdog thread has been launched, so that a second
     * {@link #start()} arriving while the first is still waiting for
     * the thread to publish {@code instance} is rejected.
     */
    private static boolean starting;

    /** Protects {@code active} and {@code alive} */
    private final Object checkinLock;
    /** Services currently being monitored by the watchdog */
    private int active;
    /** Services that have checked in since the last period */
    private int alive;

    /** Protects {@code registered}, {@code periods}, {@code missed},
     * {@code handlers} */
    private final Object configLock;
    /** The number of currently-registered services */
    private int registered;
    /** Each entry in this array is the number of periods that may pass
     * before its corresponding service is overdue */
    private final int[] periods;
    /** The number of periods since each service's most recent checkin */
    private final int[] missed;
    /** Reset handlers for each registered service */
    private final WatchdogResetHandler[] handlers;

    /**
     * A callback interface defining a watchdog reset handler to be
     * invoked if a service fails to check in to the watchdog for its
     * specified number of grace periods.
     */
    public interface WatchdogResetHandler {
        /**
         * This method will be invoked if the service fails to check in
         * to the watchdog for its allowed number of grace periods.  It
         * should restart the failed service or take whatever other
         * actions are necessary to restore the system to a working
         * state.
         *
         * <p>The handler is called on the watchdog thread while the
         * watchdog's configuration lock is held, and it is called again
         * on every subsequent period for as long as the service stays
         * active without checking in.  Calling
         * {@link Watchdog#activate(int)} for the service clears its
         * missed-period count.  Any {@code Exception} thrown by the
         * handler is caught and logged by the watchdog.
         */
        public void reset();
    }

    /**
     * Start the {@code Watchdog} service, creating an instance of the
     * watchdog.  This method blocks until the watchdog thread has
     * published the running instance.
     *
     * @throws IllegalStateException if the watchdog is already started.
     */
    static void start() {
        synchronized (instanceLock) {
            if (instance != null || starting) {
                throw new IllegalStateException("Watchdog already started");
            }

            final Watchdog w = new Watchdog();
            PriorityParameters pp = new PriorityParameters(Configuration.WATCHDOG_PRIORITY);
            PeriodicParameters periodic = new PeriodicParameters(Configuration.WATCHDOG_PERIOD);
            RealtimeThread t = new RealtimeThread(new Runnable() {
                    public void run() {
                        w.run();
                    }
                });
            t.setSchedulingParameters(pp);
            t.setReleaseParameters((ReleaseParameters)periodic);
            t.start();
            // Only mark the start as in progress once the thread was
            // actually launched; wait() below releases instanceLock, and
            // a concurrent start() must not slip past the check above.
            starting = true;

            boolean interrupted = false;
            while (instance == null) {
                try {
                    instanceLock.wait();
                } catch (InterruptedException e) {
                    // Keep waiting for the watchdog to come up, but
                    // remember the interrupt for the caller.
                    interrupted = true;
                }
            }
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Retrieve the running {@code Watchdog}.
     *
     * @return the running {@code Watchdog} if started, else {@code null}.
     */
    public static Watchdog instance() {
        synchronized (instanceLock) {
            return instance;
        }
    }

    /**
     * Construct the {@code Watchdog} singleton.
     */
    private Watchdog() {
        checkinLock = new Object();
        configLock = new Object();
        periods = new int[MAX_SERVICES];
        missed = new int[MAX_SERVICES];
        handlers = new WatchdogResetHandler[MAX_SERVICES];
    }

    /**
     * The actual watchdog periodic service routine.  This class is not
     * a {@code Runnable} so that this routine can be private.
     *
     * <p>Each period it snapshots and clears the checkin mask, then
     * updates the per-service missed counters: a service that checked
     * in (or is not active) has its counter cleared, while an active
     * service that did not check in has its counter incremented and is
     * reset once the counter exceeds its allowed number of periods.
     */
    private void run() {
        synchronized (instanceLock) {
            instance = this;
            instanceLock.notifyAll();
        }

        for (;;) {
            int nowActive;
            int nowAlive;

            ((RealtimeThread)(Thread.currentThread())).waitForNextPeriod();

            synchronized (checkinLock) {
                nowActive = active;
                nowAlive = alive;
                alive = 0;
            }

            // Bits set here are active services that did not check in.
            final int misses = nowActive & ~nowAlive;

            // The counters are updated every period, even when nothing
            // was missed; otherwise a service that recovered would keep
            // a stale count and be reset early on a later, unrelated
            // miss.
            synchronized (configLock) {
                final int services = registered;
                for (int i = 0; i < services; i++) {
                    if ((misses & (1 << i)) == 0) {
                        missed[i] = 0;
                        continue;
                    }

                    missed[i] += 1;
                    if (missed[i] <= periods[i]) {
                        // Still within the grace window.
                        continue;
                    }

                    final WatchdogResetHandler handler = handlers[i];
                    if (handler == null) {
                        // FIXME: Do something here, too
                        Log.instance().log(Log.LogLevel.WARNING, "Watchdog",
                                           "No handler for service requiring reset");
                        continue;
                    }

                    try {
                        handler.reset();
                    } catch (Exception e) {
                        // The handler really shouldn't have done that;
                        // record it and keep the watchdog running.
                        Log.instance().log(Log.LogLevel.WARNING, "Watchdog",
                                           "Reset handler threw an exception");
                    }
                    // FIXME: Repeated failures -> reboot?
                }
            }
        }
    }

    /**
     * Configure a service in the watchdog, setting the number of
     * periods that may pass without a checkin before it is considered
     * failed, and the handler to call when it fails.
     *
     * @param periods the number of periods permitted between checkins.
     * @param wrh callback to be invoked if the specified number of
     *            periods expires without checkins while the service is
     *            active; may be {@code null}, in which case a warning
     *            is logged instead of performing a reset.
     *
     * @return service The service number assigned to this registration
     *         (an opaque value).
     *
     * @throws RuntimeException if all watchdog entries are in use.
     */
    public int register(int periods, WatchdogResetHandler wrh) {
        synchronized (configLock) {
            if (registered == MAX_SERVICES) {
                throw new RuntimeException("All watchdog entries have been exhausted");
            }
            final int service = registered;
            this.periods[service] = periods;
            this.missed[service] = 0;
            this.handlers[service] = wrh;
            // Publish the slot only after it is fully populated.
            registered = service + 1;
            return service;
        }
    }

    /**
     * Activate a registered service, so that it will be consulted each
     * watchdog period.  Services that are activated will have their
     * checkins monitored, and their reset handlers called if they fail
     * to check in.  Activation also clears the service's count of
     * missed periods.
     *
     * @param service the service index as returned by
     *                {@link #register(int, WatchdogResetHandler)}.
     *
     * @throws IllegalArgumentException if {@code service} does not
     *         identify a registered service.
     */
    public void activate(int service) {
        // FIXME: Should these be nested the other way?  This should
        // really only happen at startup and shutdown...
        synchronized (configLock) {
            // Validate before touching any state so that a bad index
            // cannot leave a stray bit in the active mask.
            if (service < 0 || service >= registered) {
                throw new IllegalArgumentException("Unregistered watchdog service: " + service);
            }
            synchronized (checkinLock) {
                active |= (1 << service);
                missed[service] = 0;
            }
        }
    }

    /**
     * Deactivate a registered service, so that it is ignored when the
     * watchdog period expires.  Deactivated services will never be
     * reset regardless of failure to checkin.
     *
     * @param service the service index as returned by
     *                {@link #register(int, WatchdogResetHandler)}.
     *
     * @throws IllegalArgumentException if {@code service} is outside
     *         the range of possible service indices.
     */
    public void deactivate(int service) {
        checkServiceRange(service);
        synchronized (checkinLock) {
            active &= ~(1 << service);
        }
    }

    /**
     * Check a service in with the watchdog.
     *
     * @param service the service index as returned by
     *                {@link #register(int, WatchdogResetHandler)}.
     *
     * @throws IllegalArgumentException if {@code service} is outside
     *         the range of possible service indices.
     */
    public void checkin(int service) {
        checkServiceRange(service);
        synchronized (checkinLock) {
            alive |= (1 << service);
        }
    }

    /**
     * Reject indices that cannot name a service.  An {@code int} shift
     * only uses the low five bits of its distance, so an out-of-range
     * index would otherwise silently address another service's bit.
     */
    private static void checkServiceRange(int service) {
        if (service < 0 || service >= MAX_SERVICES) {
            throw new IllegalArgumentException("Invalid watchdog service: " + service);
        }
    }
}