}
// FIXME: fix metrics initialization in the child runner.
DefaultMetricsSystem.initialize("VertexTask");
YarnTezDagChild.containerIdStr = containerIdentifier;
ObjectRegistryImpl objectRegistry = new ObjectRegistryImpl();
@SuppressWarnings("unused")
Injector injector = Guice.createInjector(
new ObjectRegistryModule(objectRegistry));
// Security framework already loaded the tokens into current ugi
Credentials credentials =
UserGroupInformation.getCurrentUser().getCredentials();
if (LOG.isDebugEnabled()) {
LOG.debug("Executing with tokens:");
for (Token<?> token : credentials.getAllTokens()) {
LOG.debug(token);
}
}
amPollInterval = defaultConf.getLong(
TezConfiguration.TEZ_TASK_AM_HEARTBEAT_INTERVAL_MS,
TezConfiguration.TEZ_TASK_AM_HEARTBEAT_INTERVAL_MS_DEFAULT);
maxEventsToGet = defaultConf.getInt(
TezConfiguration.TEZ_TASK_MAX_EVENTS_PER_HEARTBEAT,
TezConfiguration.TEZ_TASK_MAX_EVENTS_PER_HEARTBEAT_DEFAULT);
// Create TaskUmbilicalProtocol as actual task owner.
UserGroupInformation taskOwner =
UserGroupInformation.createRemoteUser(tokenIdentifier);
Token<JobTokenIdentifier> jobToken = TokenCache.getSessionToken(credentials);
SecurityUtil.setTokenService(jobToken, address);
taskOwner.addToken(jobToken);
// Will jobToken change across DAGs ?
Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
ShuffleUtils.convertJobTokenToBytes(jobToken));
// Create the umbilical RPC proxy inside taskOwner.doAs(...), i.e. while running as the
// remote-user UGI that had the job token added to it above.
umbilical = taskOwner.doAs(
    new PrivilegedExceptionAction<TezTaskUmbilicalProtocol>() {
      @Override
      public TezTaskUmbilicalProtocol run() throws Exception {
        // Proxy for TezTaskUmbilicalProtocol, bound to `address` and configured from defaultConf.
        TezTaskUmbilicalProtocol proxy = (TezTaskUmbilicalProtocol) RPC.getProxy(
            TezTaskUmbilicalProtocol.class,
            TezTaskUmbilicalProtocol.versionID,
            address,
            defaultConf);
        return proxy;
      }
    });
final Thread heartbeatThread = startHeartbeatThread();
// TezUmbilical implementation: addEvents appends to eventsToSend, signalFatalError heartbeats
// a failure report immediately, and canCommit delegates to the umbilical RPC proxy.
TezUmbilical tezUmbilical = new TezUmbilical() {

  /** Appends the supplied events to {@code eventsToSend}. */
  @Override
  public void addEvents(Collection<TezEvent> events) {
    eventsToSend.addAll(events);
  }

  /**
   * Reports a fatal error by calling {@code heartbeat} right away with two events: a status
   * update built from {@code currentTask}, followed by a task-attempt-failed event that
   * carries {@code diagnostics} and is attributed to {@code sourceInfo}.
   *
   * <p>The {@code taskAttemptID} argument is not read here; the attempt id used in the
   * status update comes from {@code currentTask}.
   *
   * <p>If the heartbeat throws, the failure is logged; an {@link Error} terminates the
   * process, otherwise the throwable is recorded in {@code heartbeatErrorException},
   * {@code heartbeatError} is set, and the heartbeat thread is interrupted.
   */
  @Override
  public void signalFatalError(TezTaskAttemptID taskAttemptID,
      String diagnostics,
      EventMetaData sourceInfo) {
    // Invoked before the counters are read for the status update below.
    currentTask.setFrameworkCounters();

    // Status update, attributed to the SYSTEM producer for the current vertex/attempt.
    TaskStatusUpdateEvent statusUpdate = new TaskStatusUpdateEvent(
        currentTask.getCounters(), currentTask.getProgress());
    EventMetaData systemSource = new EventMetaData(
        EventProducerConsumerType.SYSTEM,
        currentTask.getVertexName(),
        "",
        currentTask.getTaskAttemptID());
    TezEvent statusUpdateEvent = new TezEvent(statusUpdate, systemSource);

    // Failure notification, attributed to whichever component reported the error.
    TaskAttemptFailedEvent failure = new TaskAttemptFailedEvent(diagnostics);
    TezEvent taskAttemptFailedEvent = new TezEvent(failure, sourceInfo);

    try {
      // taskComplete is intentionally left untouched: the main loop that performs cleanup
      // does not have control yet, and it only regains control if the I/P/O returns
      // correctly after reporting the error.
      heartbeat(Lists.newArrayList(statusUpdateEvent, taskAttemptFailedEvent));
    } catch (Throwable t) {
      LOG.fatal("Failed to communicate task attempt failure to AM via umbilical", t);
      if (t instanceof Error) {
        LOG.error("Exception of type Error. Exiting now", t);
        ExitUtil.terminate(-1, t);
      }
      // FIXME NEWTEZ maybe send a container failed event to AM?
      // Irrecoverable unless heartbeat sync can be re-established: record the cause, raise
      // the error flag, and interrupt the heartbeat thread.
      heartbeatErrorException = t;
      heartbeatError.set(true);
      heartbeatThread.interrupt();
    }
  }

  /** Delegates the commit check for {@code taskAttemptID} to the umbilical proxy. */
  @Override
  public boolean canCommit(TezTaskAttemptID taskAttemptID) throws IOException {
    return umbilical.canCommit(taskAttemptID);
  }
};
// report non-pid to application master
String pid = System.getenv().get("JVM_PID");
LOG.info("PID, containerIdentifier: " + pid + ", " + containerIdentifier);
ContainerTask containerTask = null;
UserGroupInformation childUGI = null;
ContainerContext containerContext = new ContainerContext(
containerIdentifier, pid);
int getTaskMaxSleepTime = defaultConf.getInt(
TezConfiguration.TEZ_TASK_GET_TASK_SLEEP_INTERVAL_MS_MAX,
TezConfiguration.TEZ_TASK_GET_TASK_SLEEP_INTERVAL_MS_MAX_DEFAULT);
int taskCount = 0;
TezVertexID lastVertexId = null;
EventMetaData currentSourceInfo = null;
try {
String loggerAddend = "";
while (true) {
// poll for new task
if (taskCount > 0) {
TezUtils.updateLoggers(loggerAddend);
}
boolean isNewGetTask = true;
long getTaskPollStartTime = System.currentTimeMillis();
long nextGetTaskPrintTime = getTaskPollStartTime + 2000l;
for (int idle = 0; null == containerTask; ++idle) {
if (!isNewGetTask) { // Don't sleep on the first iteration.
long sleepTimeMilliSecs = Math.min(idle * 10, getTaskMaxSleepTime);
if (sleepTimeMilliSecs + System.currentTimeMillis() > nextGetTaskPrintTime) {
LOG.info("Sleeping for "
+ sleepTimeMilliSecs
+ "ms before retrying getTask again. Got null now. "
+ "Next getTask sleep message after 2s");
nextGetTaskPrintTime = System.currentTimeMillis() + sleepTimeMilliSecs + 2000l;
}
MILLISECONDS.sleep(sleepTimeMilliSecs);
} else {
LOG.info("Attempting to fetch new task");
}
isNewGetTask = false;
containerTask = umbilical.getTask(containerContext);
}
LOG.info("Got TaskUpdate: "
+ (System.currentTimeMillis() - getTaskPollStartTime)
+ " ms after starting to poll."
+ " TaskInfo: shouldDie: " + containerTask.shouldDie()
+ (containerTask.shouldDie() == true ? "" : ", currentTaskAttemptId: "
+ containerTask.getTaskSpec().getTaskAttemptID()));
if (containerTask.shouldDie()) {
return;
}
taskCount++;
// Reset FileSystem statistics
FileSystem.clearStatistics();
// Re-use the UGI only if the Credentials have not changed.
if (containerTask.haveCredentialsChanged()) {
LOG.info("Refreshing UGI since Credentials have changed");
Credentials taskCreds = containerTask.getCredentials();
if (taskCreds != null) {
LOG.info("Credentials : #Tokens=" + taskCreds.numberOfTokens() + ", #SecretKeys="
+ taskCreds.numberOfSecretKeys());
childUGI = UserGroupInformation.createRemoteUser(System
.getenv(ApplicationConstants.Environment.USER.toString()));
childUGI.addCredentials(containerTask.getCredentials());
} else {
LOG.info("Not loading any credentials, since no credentials provided");
}
}
Map<String, TezLocalResource> additionalResources = containerTask.getAdditionalResources();
if (LOG.isDebugEnabled()) {
LOG.debug("Additional Resources added to container: " + additionalResources);
}
LOG.info("Localizing additional local resources for Task : " + additionalResources);
List<URL> downloadedUrls = RelocalizationUtils.processAdditionalResources(
Maps.transformValues(additionalResources, new Function<TezLocalResource, URI>() {
@Override
public URI apply(TezLocalResource input) {
return input.getUri();
}
}), defaultConf);
RelocalizationUtils.addUrlsToClassPath(downloadedUrls);
LOG.info("Done localizing additional resources");
final TaskSpec taskSpec = containerTask.getTaskSpec();
if (LOG.isDebugEnabled()) {
LOG.debug("New container task context:"
+ taskSpec.toString());
}
try {
taskLock.writeLock().lock();
currentTaskAttemptID = taskSpec.getTaskAttemptID();
TezVertexID newVertexId =
currentTaskAttemptID.getTaskID().getVertexID();
currentTaskComplete.set(false);
if (lastVertexId != null) {
if (!lastVertexId.equals(newVertexId)) {
objectRegistry.clearCache(ObjectLifeCycle.VERTEX);
}
if (!lastVertexId.getDAGId().equals(newVertexId.getDAGId())) {
objectRegistry.clearCache(ObjectLifeCycle.DAG);
startedInputsMap = HashMultimap.create();
}
}
lastVertexId = newVertexId;
TezUtils.updateLoggers(currentTaskAttemptID.toString());