/**
 * Reports a fatal failure by pushing a single {@code TaskAttemptFailedEvent}
 * through {@code heartbeat}.
 *
 * <p>If the heartbeat call itself throws, the throwable is logged, stored in
 * {@code heartbeatErrorException}, the {@code heartbeatError} flag is raised
 * and the heartbeat thread is interrupted.
 *
 * @param taskAttemptID attempt the failure belongs to; not read by this
 *     implementation
 * @param diagnostics text handed to the {@code TaskAttemptFailedEvent}
 * @param sourceInfo metadata attached to the outgoing {@code TezEvent}
 */
@Override
public void signalFatalError(TezTaskAttemptID taskAttemptID,
    String diagnostics,
    EventMetaData sourceInfo) {
  // Build the event up front so that only the send itself is guarded below.
  final TaskAttemptFailedEvent failurePayload =
      new TaskAttemptFailedEvent(diagnostics);
  final TezEvent failureNotice = new TezEvent(failurePayload, sourceInfo);

  try {
    heartbeat(Collections.singletonList(failureNotice));
  } catch (Throwable sendFailure) {
    LOG.fatal("Failed to communicate task attempt failure to AM via umbilical",
        sendFailure);
    // FIXME NEWTEZ maybe send a container failed event to AM?
    // Irrecoverable error unless heartbeat sync can be re-established
    // Store the cause before raising the flag, then wake the heartbeat thread.
    heartbeatErrorException = sendFailure;
    heartbeatError.set(true);
    heartbeatThread.interrupt();
  }
}
/**
 * Asks the umbilical whether the given task attempt may commit.
 *
 * @param taskAttemptID attempt to query
 * @return the answer returned by {@code umbilical.canCommit}
 * @throws IOException propagated unchanged from the umbilical call
 */
@Override
public boolean canCommit(TezTaskAttemptID taskAttemptID)
    throws IOException {
  // Pure delegation; no local state is consulted.
  final boolean commitAllowed = umbilical.canCommit(taskAttemptID);
  return commitAllowed;
}
};
// report non-pid to application master
String pid = System.getenv().get("JVM_PID");
LOG.info("PID, containerIdentifier: " + pid + ", " + containerIdentifier);
ContainerTask containerTask = null;
UserGroupInformation childUGI = null;
ContainerContext containerContext = new ContainerContext(
containerIdentifier, pid);
int getTaskMaxSleepTime = defaultConf.getInt(
TezConfiguration.TEZ_TASK_GET_TASK_SLEEP_INTERVAL_MS_MAX,
TezConfiguration.TEZ_TASK_GET_TASK_SLEEP_INTERVAL_MS_MAX_DEFAULT);
int taskCount = 0;
TezVertexID lastVertexId = null;
EventMetaData currentSourceInfo = null;
try {
while (true) {
// poll for new task
if (taskCount > 0) {
updateLoggers(null);
}
boolean isNewGetTask = true;
long getTaskPollStartTime = System.currentTimeMillis();
long nextGetTaskPrintTime = getTaskPollStartTime + 2000l;
for (int idle = 0; null == containerTask; ++idle) {
if (!isNewGetTask) { // Don't sleep on the first iteration.
long sleepTimeMilliSecs = Math.min(idle * 10, getTaskMaxSleepTime);
if (sleepTimeMilliSecs + System.currentTimeMillis() > nextGetTaskPrintTime) {
LOG.info("Sleeping for "
+ sleepTimeMilliSecs
+ "ms before retrying getTask again. Got null now. "
+ "Next getTask sleep message after 2s");
nextGetTaskPrintTime = System.currentTimeMillis() + sleepTimeMilliSecs + 2000l;
}
MILLISECONDS.sleep(sleepTimeMilliSecs);
} else {
LOG.info("Attempting to fetch new task");
}
isNewGetTask = false;
containerTask = umbilical.getTask(containerContext);
}
LOG.info("Got TaskUpdate: "
+ (System.currentTimeMillis() - getTaskPollStartTime)
+ " ms after starting to poll."
+ " TaskInfo: shouldDie: " + containerTask.shouldDie()
+ (containerTask.shouldDie() == true ? "" : ", currentTaskAttemptId: "
+ containerTask.getTaskSpec().getTaskAttemptID()));
if (containerTask.shouldDie()) {
return;
}
taskCount++;
final TaskSpec taskSpec = containerTask.getTaskSpec();
if (LOG.isDebugEnabled()) {
LOG.debug("New container task context:"
+ taskSpec.toString());
}
try {
taskLock.writeLock().lock();
currentTaskAttemptID = taskSpec.getTaskAttemptID();
TezVertexID newVertexId =
currentTaskAttemptID.getTaskID().getVertexID();
if (lastVertexId != null) {
if (!lastVertexId.equals(newVertexId)) {
objectRegistry.clearCache(ObjectLifeCycle.VERTEX);
}
if (!lastVertexId.getDAGId().equals(newVertexId.getDAGId())) {
objectRegistry.clearCache(ObjectLifeCycle.DAG);
}
}
lastVertexId = newVertexId;
updateLoggers(currentTaskAttemptID);
currentTask = createLogicalTask(attemptNumber, taskSpec,
defaultConf, tezUmbilical, serviceConsumerMetadata);
} finally {
taskLock.writeLock().unlock();
}
final EventMetaData sourceInfo = new EventMetaData(
EventProducerConsumerType.SYSTEM,
taskSpec.getVertexName(), "", currentTaskAttemptID);
currentSourceInfo = sourceInfo;
// TODO Initiate Java VM metrics
// JvmMetrics.initSingleton(containerId.toString(), job.getSessionId());
childUGI = UserGroupInformation.createRemoteUser(System
.getenv(ApplicationConstants.Environment.USER.toString()));
// Add tokens to new user so that it may execute its task correctly.
childUGI.addCredentials(credentials);
childUGI.doAs(new PrivilegedExceptionAction<Object>() {
/**
 * Drives {@code currentTask} through one full attempt: initialize, then (only
 * if no fatal error was flagged) run and close, then report the final status
 * and completion through {@code heartbeat}.
 *
 * <p>{@code currentTask.cleanup()} is invoked on both the normal and the
 * exceptional path. The {@code currentTask} / {@code currentTaskAttemptID}
 * references are cleared under the write lock only when nothing above threw;
 * on an exception they are left populated.
 *
 * @return always {@code null}
 * @throws Exception anything thrown by the task lifecycle calls, the
 *     heartbeat, or cleanup
 */
@Override
public Object run() throws Exception {
  try {
    LOG.info("Initializing task"
        + ", taskAttemptId=" + currentTaskAttemptID);
    currentTask.initialize();
    // initialize() may flag a fatal error without throwing; skip run/close then.
    if (!currentTask.hadFatalError()) {
      LOG.info("Running task"
          + ", taskAttemptId=" + currentTaskAttemptID);
      currentTask.run();
      LOG.info("Closing task"
          + ", taskAttemptId=" + currentTaskAttemptID);
      currentTask.close();
    }
    LOG.info("Task completed"
        + ", taskAttemptId=" + currentTaskAttemptID
        + ", fatalErrorOccurred=" + currentTask.hadFatalError());
    // Re-check: run() or close() may also have flagged a fatal error.
    if (!currentTask.hadFatalError()) {
      TezEvent statusUpdateEvent =
          new TezEvent(new TaskStatusUpdateEvent(
              currentTask.getCounters(), currentTask.getProgress()),
              new EventMetaData(EventProducerConsumerType.SYSTEM,
                  currentTask.getVertexName(), "",
                  currentTask.getTaskAttemptID()));
      TezEvent taskCompletedEvent =
          new TezEvent(new TaskAttemptCompletedEvent(), sourceInfo);
      // Final status update and completion marker go out in one heartbeat.
      heartbeat(Arrays.asList(statusUpdateEvent, taskCompletedEvent));
    }
  } finally {
    currentTask.cleanup();
  }
  // Acquire the lock BEFORE entering the try: if lock() were to fail, the
  // finally block must not call unlock() on a lock this thread does not hold.
  taskLock.writeLock().lock();
  try {
    currentTask = null;
    currentTaskAttemptID = null;
  } finally {
    taskLock.writeLock().unlock();
  }
  return null;
}
});
FileSystem.closeAllForUGI(childUGI);
containerTask = null;
if (heartbeatError.get()) {
LOG.fatal("Breaking out of task loop, heartbeat error occurred",
heartbeatErrorException);
break;
}
}
} catch (FSError e) {
LOG.fatal("FSError from child", e);
// TODO NEWTEZ this should be a container failed event?
try {
taskLock.readLock().lock();
if (currentTask != null && !currentTask.hadFatalError()) {
// Prevent dup failure events
currentTask.setFatalError(e, "FS Error in Child JVM");
TezEvent taskAttemptFailedEvent =
new TezEvent(new TaskAttemptFailedEvent(
StringUtils.stringifyException(e)),
currentSourceInfo);
heartbeat(Collections.singletonList(taskAttemptFailedEvent));
}
} finally {
taskLock.readLock().unlock();
}
} catch (Throwable throwable) {
String cause = StringUtils.stringifyException(throwable);
LOG.fatal("Error running child : " + cause);
taskLock.readLock().lock();
try {
if (currentTask != null && !currentTask.hadFatalError()) {
// Prevent dup failure events
currentTask.setFatalError(throwable, "Error in Child JVM");
TezEvent taskAttemptFailedEvent =
new TezEvent(new TaskAttemptFailedEvent(cause),
currentSourceInfo);
heartbeat(Collections.singletonList(taskAttemptFailedEvent));
}
} finally {
taskLock.readLock().unlock();