// at the tail of the queue
// TODO: this polling does not work with multiple usernames; with a single user
// and multiple hosts, monitoring currently works
UserMonitorData take = null;
JobStatusChangeRequest jobStatus = new JobStatusChangeRequest();
MonitorID currentMonitorID = null;
HostDescription currentHostDescription = null;
try {
take = this.queue.take();
List<MonitorID> completedJobs = new ArrayList<MonitorID>();
List<HostMonitorData> hostMonitorData = take.getHostMonitorData();
for (HostMonitorData iHostMonitorData : hostMonitorData) {
if (iHostMonitorData.getHost().getType() instanceof GsisshHostType
|| iHostMonitorData.getHost().getType() instanceof SSHHostType) {
currentHostDescription = iHostMonitorData.getHost();
String hostName = iHostMonitorData.getHost().getType().getHostAddress();
ResourceConnection connection = null;
if (connections.containsKey(hostName)) {
logger.debug("We already have this connection so not going to create one");
connection = connections.get(hostName);
} else {
connection = new ResourceConnection(iHostMonitorData,getAuthenticationInfo());
connections.put(hostName, connection);
}
List<MonitorID> monitorID = iHostMonitorData.getMonitorIDs();
Map<String, JobState> jobStatuses = connection.getJobStatuses(monitorID);
for (MonitorID iMonitorID : monitorID) {
currentMonitorID = iMonitorID;
iMonitorID.setStatus(jobStatuses.get(iMonitorID.getJobID())); //IMPORTANT this is not a simple setter we have a logic
jobStatus = new JobStatusChangeRequest(iMonitorID);
// we have this JobStatus class to handle amqp monitoring
publisher.publish(jobStatus);
// if the job is completed we do not have to put the job to the queue again
iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
// After successful monitoring perform following actions to cleanup the queue, if necessary
if (jobStatus.getState().equals(JobState.COMPLETE)) {
completedJobs.add(iMonitorID);
try {
gfac.invokeOutFlowHandlers(iMonitorID.getJobExecutionContext());
} catch (GFacException e) {
publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(),
iMonitorID.getTaskID()), TaskState.FAILED));
publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()),
ExperimentState.FAILED));
logger.info(e.getLocalizedMessage(), e);
}
} else if (iMonitorID.getFailedCount() > 2) {
logger.error("Tried to monitor the job with ID " + iMonitorID.getJobID() + " But failed 3 times, so skip this Job from Monitor");
iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
completedJobs.add(iMonitorID);
try {
logger.error("Launching outflow handlers to check output are genereated or not");
gfac.invokeOutFlowHandlers(iMonitorID.getJobExecutionContext());
} catch (GFacException e) {
publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(),
iMonitorID.getTaskID()), TaskState.FAILED));
publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()),
ExperimentState.FAILED));
logger.info(e.getLocalizedMessage(), e);
}
} else {
// Evey
iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
// if the job is complete we remove it from the Map, if any of these maps
// get empty this userMonitorData will get delete from the queue
}
}
} else {
logger.debug("Qstat Monitor doesn't handle non-gsissh hosts");
}
}
// We have finished all the HostMonitorData object in userMonitorData, now we need to put it back
// now the userMonitorData goes back to the tail of the queue
queue.put(take);
// cleaning up the completed jobs, this method will remove some of the userMonitorData from the queue if
// they become empty
for (MonitorID completedJob : completedJobs) {
CommonUtils.removeMonitorFromQueue(queue, completedJob);
}
} catch (InterruptedException e) {
if (!this.queue.contains(take)) {
try {
this.queue.put(take);
} catch (InterruptedException e1) {
e1.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
}
logger.error("Error handling the job with Job ID:" + currentMonitorID.getJobID());
throw new AiravataMonitorException(e);
} catch (SSHApiException e) {
logger.error(e.getMessage());
if (e.getMessage().contains("Unknown Job Id Error")) {
// in this case job is finished or may be the given job ID is wrong
jobStatus.setState(JobState.UNKNOWN);
publisher.publish(jobStatus);
} else if (e.getMessage().contains("illegally formed job identifier")) {
logger.error("Wrong job ID is given so dropping the job from monitoring system");
} else if (!this.queue.contains(take)) { // we put the job back to the queue only if its state is not unknown
if (currentMonitorID == null) {
logger.error("Monitoring the jobs failed, for user: " + take.getUserName()
+ " in Host: " + currentHostDescription.getType().getHostAddress());
} else {
if (currentMonitorID != null) {
if (currentMonitorID.getFailedCount() < 2) {
try {
currentMonitorID.setFailedCount(currentMonitorID.getFailedCount() + 1);
this.queue.put(take);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
} else {
logger.error(e.getMessage());
logger.error("Tried to monitor the job 3 times, so dropping of the the Job with ID: " + currentMonitorID.getJobID());
}
}
}
}
throw new AiravataMonitorException("Error retrieving the job status", e);
} catch (Exception e) {
if (currentMonitorID != null) {
if (currentMonitorID.getFailedCount() < 3) {
try {
currentMonitorID.setFailedCount(currentMonitorID.getFailedCount() + 1);
this.queue.put(take);
// if we get a wrong status we wait for a while and request again
Thread.sleep(10000);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
} else {
logger.error(e.getMessage());
logger.error("Tryied to monitor the job 3 times, so dropping of the the Job with ID: " + currentMonitorID.getJobID());
}
}
throw new AiravataMonitorException("Error retrieving the job status", e);
}