/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.nephele.jobmanager;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import eu.stratosphere.nephele.managementgraph.ManagementVertexID;
import eu.stratosphere.nephele.taskmanager.TaskKillResult;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import eu.stratosphere.configuration.ConfigConstants;
import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.configuration.GlobalConfiguration;
import eu.stratosphere.core.io.StringRecord;
import eu.stratosphere.nephele.client.AbstractJobResult;
import eu.stratosphere.nephele.client.AbstractJobResult.ReturnCode;
import eu.stratosphere.nephele.client.JobCancelResult;
import eu.stratosphere.nephele.client.JobProgressResult;
import eu.stratosphere.nephele.client.JobSubmissionResult;
import eu.stratosphere.nephele.deployment.TaskDeploymentDescriptor;
import eu.stratosphere.nephele.event.job.AbstractEvent;
import eu.stratosphere.nephele.event.job.RecentJobEvent;
import eu.stratosphere.nephele.execution.ExecutionState;
import eu.stratosphere.nephele.execution.librarycache.LibraryCacheManager;
import eu.stratosphere.nephele.executiongraph.ExecutionEdge;
import eu.stratosphere.nephele.executiongraph.ExecutionGraph;
import eu.stratosphere.nephele.executiongraph.ExecutionGraphIterator;
import eu.stratosphere.nephele.executiongraph.ExecutionVertex;
import eu.stratosphere.nephele.executiongraph.ExecutionVertexID;
import eu.stratosphere.nephele.executiongraph.GraphConversionException;
import eu.stratosphere.nephele.executiongraph.InternalJobStatus;
import eu.stratosphere.nephele.executiongraph.JobStatusListener;
import eu.stratosphere.nephele.instance.AbstractInstance;
import eu.stratosphere.nephele.instance.DummyInstance;
import eu.stratosphere.nephele.instance.HardwareDescription;
import eu.stratosphere.nephele.instance.InstanceConnectionInfo;
import eu.stratosphere.nephele.instance.InstanceManager;
import eu.stratosphere.nephele.instance.InstanceType;
import eu.stratosphere.nephele.instance.InstanceTypeDescription;
import eu.stratosphere.nephele.instance.local.LocalInstanceManager;
import eu.stratosphere.runtime.io.channels.ChannelID;
import eu.stratosphere.nephele.ipc.RPC;
import eu.stratosphere.nephele.ipc.Server;
import eu.stratosphere.nephele.jobgraph.AbstractJobVertex;
import eu.stratosphere.nephele.jobgraph.JobGraph;
import eu.stratosphere.nephele.jobgraph.JobID;
import eu.stratosphere.nephele.jobmanager.accumulators.AccumulatorManager;
import eu.stratosphere.nephele.jobmanager.archive.ArchiveListener;
import eu.stratosphere.nephele.jobmanager.archive.MemoryArchivist;
import eu.stratosphere.nephele.jobmanager.scheduler.AbstractScheduler;
import eu.stratosphere.nephele.jobmanager.scheduler.SchedulingException;
import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitManager;
import eu.stratosphere.nephele.jobmanager.splitassigner.InputSplitWrapper;
import eu.stratosphere.nephele.jobmanager.web.WebInfoServer;
import eu.stratosphere.nephele.managementgraph.ManagementGraph;
import eu.stratosphere.nephele.profiling.JobManagerProfiler;
import eu.stratosphere.nephele.profiling.ProfilingUtils;
import eu.stratosphere.nephele.protocols.AccumulatorProtocol;
import eu.stratosphere.nephele.protocols.ChannelLookupProtocol;
import eu.stratosphere.nephele.protocols.ExtendedManagementProtocol;
import eu.stratosphere.nephele.protocols.InputSplitProviderProtocol;
import eu.stratosphere.nephele.protocols.JobManagerProtocol;
import eu.stratosphere.nephele.services.accumulators.AccumulatorEvent;
import eu.stratosphere.nephele.taskmanager.AbstractTaskResult;
import eu.stratosphere.nephele.taskmanager.TaskCancelResult;
import eu.stratosphere.nephele.taskmanager.TaskExecutionState;
import eu.stratosphere.nephele.taskmanager.TaskSubmissionResult;
import eu.stratosphere.runtime.io.network.ConnectionInfoLookupResponse;
import eu.stratosphere.runtime.io.network.RemoteReceiver;
import eu.stratosphere.nephele.taskmanager.ExecutorThreadFactory;
import eu.stratosphere.nephele.topology.NetworkTopology;
import eu.stratosphere.nephele.types.IntegerRecord;
import eu.stratosphere.nephele.util.SerializableArrayList;
import eu.stratosphere.util.StringUtils;
/**
* In Nephele the job manager is the central component for communication with clients, creating
* schedules for incoming jobs and supervise their execution. A job manager may only exist once in
* the system and its address must be known the clients.
* Task managers can discover the job manager by means of an UDP broadcast and afterwards advertise
* themselves as new workers for tasks.
*
*/
public class JobManager implements DeploymentManager, ExtendedManagementProtocol, InputSplitProviderProtocol,
JobManagerProtocol, ChannelLookupProtocol, JobStatusListener, AccumulatorProtocol
{
public static enum ExecutionMode { LOCAL, CLUSTER }
// --------------------------------------------------------------------------------------------
private static final Log LOG = LogFactory.getLog(JobManager.class);
private final Server jobManagerServer;
private final JobManagerProfiler profiler;
private final EventCollector eventCollector;
private final ArchiveListener archive;
private final InputSplitManager inputSplitManager;
private final AbstractScheduler scheduler;
private AccumulatorManager accumulatorManager;
private InstanceManager instanceManager;
private final int recommendedClientPollingInterval;
private final ExecutorService executorService = Executors.newCachedThreadPool(ExecutorThreadFactory.INSTANCE);
private final static int FAILURE_RETURN_CODE = 1;
private final AtomicBoolean isShutdownInProgress = new AtomicBoolean(false);
private volatile boolean isShutDown;
private WebInfoServer server;
public JobManager(ExecutionMode executionMode) throws Exception {
final String ipcAddressString = GlobalConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null);
InetAddress ipcAddress = null;
if (ipcAddressString != null) {
try {
ipcAddress = InetAddress.getByName(ipcAddressString);
} catch (UnknownHostException e) {
throw new Exception("Cannot convert " + ipcAddressString + " to an IP address: " + e.getMessage(), e);
}
}
final int ipcPort = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
// Read the suggested client polling interval
this.recommendedClientPollingInterval = GlobalConfiguration.getInteger(
ConfigConstants.JOBCLIENT_POLLING_INTERVAL_KEY, ConfigConstants.DEFAULT_JOBCLIENT_POLLING_INTERVAL);
// Load the job progress collector
this.eventCollector = new EventCollector(this.recommendedClientPollingInterval);
// Register simple job archive
int archived_items = GlobalConfiguration.getInteger(
ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT);
if (archived_items > 0) {
this.archive = new MemoryArchivist(archived_items);
this.eventCollector.registerArchivist(archive);
}
else {
this.archive = null;
}
// Create the accumulator manager, with same archiving limit as web
// interface. We need to store the accumulators for at least one job.
// Otherwise they might be deleted before the client requested the
// accumulator results.
this.accumulatorManager = new AccumulatorManager(Math.min(1, archived_items));
// Load the input split manager
this.inputSplitManager = new InputSplitManager();
// Determine own RPC address
final InetSocketAddress rpcServerAddress = new InetSocketAddress(ipcAddress, ipcPort);
// Start job manager's IPC server
try {
final int handlerCount = GlobalConfiguration.getInteger(ConfigConstants.JOB_MANAGER_IPC_HANDLERS_KEY,
ConfigConstants.DEFAULT_JOB_MANAGER_IPC_HANDLERS);
this.jobManagerServer = RPC.getServer(this, rpcServerAddress.getHostName(), rpcServerAddress.getPort(), handlerCount);
this.jobManagerServer.start();
} catch (IOException e) {
throw new Exception("Cannot start RPC server: " + e.getMessage(), e);
}
LOG.info("Starting job manager in " + executionMode + " mode");
// Try to load the instance manager for the given execution mode
// Try to load the scheduler for the given execution mode
if (executionMode == ExecutionMode.LOCAL) {
try {
this.instanceManager = new LocalInstanceManager();
} catch (Throwable t) {
throw new Exception("Cannot instantiate local instance manager: " + t.getMessage(), t);
}
} else {
final String instanceManagerClassName = JobManagerUtils.getInstanceManagerClassName(executionMode);
LOG.info("Trying to load " + instanceManagerClassName + " as instance manager");
this.instanceManager = JobManagerUtils.loadInstanceManager(instanceManagerClassName);
if (this.instanceManager == null) {
throw new Exception("Unable to load instance manager " + instanceManagerClassName);
}
}
// Try to load the scheduler for the given execution mode
final String schedulerClassName = JobManagerUtils.getSchedulerClassName(executionMode);
LOG.info("Trying to load " + schedulerClassName + " as scheduler");
// Try to get the instance manager class name
this.scheduler = JobManagerUtils.loadScheduler(schedulerClassName, this, this.instanceManager);
if (this.scheduler == null) {
throw new Exception("Unable to load scheduler " + schedulerClassName);
}
// Load profiler if it should be used
if (GlobalConfiguration.getBoolean(ProfilingUtils.ENABLE_PROFILING_KEY, false)) {
final String profilerClassName = GlobalConfiguration.getString(ProfilingUtils.JOBMANAGER_CLASSNAME_KEY,
"eu.stratosphere.nephele.profiling.impl.JobManagerProfilerImpl");
this.profiler = ProfilingUtils.loadJobManagerProfiler(profilerClassName, ipcAddress);
if (this.profiler == null) {
throw new Exception("Cannot load profiler");
}
} else {
this.profiler = null;
LOG.debug("Profiler disabled");
}
}
public void shutdown() {
if (!this.isShutdownInProgress.compareAndSet(false, true)) {
return;
}
// Stop instance manager
if (this.instanceManager != null) {
this.instanceManager.shutdown();
}
// Stop profiling if enabled
if (this.profiler != null) {
this.profiler.shutdown();
}
// Stop RPC server
if (this.jobManagerServer != null) {
this.jobManagerServer.stop();
}
// Stop the executor service
if (this.executorService != null) {
this.executorService.shutdown();
try {
this.executorService.awaitTermination(5000L, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
LOG.debug(e);
}
}
// Stop and clean up the job progress collector
if (this.eventCollector != null) {
this.eventCollector.shutdown();
}
// Finally, shut down the scheduler
if (this.scheduler != null) {
this.scheduler.shutdown();
}
this.isShutDown = true;
LOG.debug("Shutdown of job manager completed");
}
/**
* Log Stratosphere version information.
*/
private static void logVersionInformation() {
String version = JobManager.class.getPackage().getImplementationVersion();
// if version == null, then the JobManager runs from inside the IDE (or somehow not from the maven build jar)
String revision = "<unknown>";
try {
Properties properties = new Properties();
InputStream propFile = JobManager.class.getClassLoader().getResourceAsStream(".version.properties");
if (propFile != null) {
properties.load(propFile);
revision = properties.getProperty("git.commit.id.abbrev");
}
} catch (IOException e) {
LOG.info("Cannot determine code revision. Unable ro read version property file.");
}
LOG.info("Starting Stratosphere JobManager (Version: " + version + ", Rev:" + revision + ")");
}
/**
* Entry point for the program
*
* @param args
* arguments from the command line
*/
public static void main(String[] args) {
// determine if a valid log4j config exists and initialize a default logger if not
if (System.getProperty("log4j.configuration") == null) {
Logger root = Logger.getRootLogger();
root.removeAllAppenders();
PatternLayout layout = new PatternLayout("%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n");
ConsoleAppender appender = new ConsoleAppender(layout, "System.err");
root.addAppender(appender);
root.setLevel(Level.INFO);
}
JobManager jobManager;
try {
jobManager = initialize(args);
// Start info server for jobmanager
jobManager.startInfoServer();
}
catch (Exception e) {
LOG.fatal(e.getMessage(), e);
System.exit(FAILURE_RETURN_CODE);
}
// Clean up is triggered through a shutdown hook
// freeze this thread to keep the JVM alive (the job manager threads are daemon threads)
Object w = new Object();
synchronized (w) {
try {
w.wait();
} catch (InterruptedException e) {}
}
}
@SuppressWarnings("static-access")
public static JobManager initialize(String[] args) throws Exception {
// output the version and revision information to the log
logVersionInformation();
final Option configDirOpt = OptionBuilder.withArgName("config directory").hasArg()
.withDescription("Specify configuration directory.").create("configDir");
final Option executionModeOpt = OptionBuilder.withArgName("execution mode").hasArg()
.withDescription("Specify execution mode.").create("executionMode");
final Options options = new Options();
options.addOption(configDirOpt);
options.addOption(executionModeOpt);
CommandLineParser parser = new GnuParser();
CommandLine line = null;
try {
line = parser.parse(options, args);
} catch (ParseException e) {
LOG.error("CLI Parsing failed. Reason: " + e.getMessage());
System.exit(FAILURE_RETURN_CODE);
}
final String configDir = line.getOptionValue(configDirOpt.getOpt(), null);
final String executionModeName = line.getOptionValue(executionModeOpt.getOpt(), "local");
ExecutionMode executionMode = null;
if ("local".equals(executionModeName)) {
executionMode = ExecutionMode.LOCAL;
} else if ("cluster".equals(executionModeName)) {
executionMode = ExecutionMode.CLUSTER;
} else {
System.err.println("Unrecognized execution mode: " + executionModeName);
System.exit(FAILURE_RETURN_CODE);
}
// First, try to load global configuration
GlobalConfiguration.loadConfiguration(configDir);
// Create a new job manager object
JobManager jobManager = new JobManager(executionMode);
// Set base dir for info server
Configuration infoserverConfig = GlobalConfiguration.getConfiguration();
if (configDir != null && new File(configDir).isDirectory()) {
infoserverConfig.setString(ConfigConstants.STRATOSPHERE_BASE_DIR_PATH_KEY, configDir+"/..");
}
GlobalConfiguration.includeConfiguration(infoserverConfig);
return jobManager;
}
@Override
public JobSubmissionResult submitJob(JobGraph job) throws IOException {
try {
// First check if job is null
if (job == null) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
"Submitted job is null!");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Submitted job " + job.getName() + " is not null");
}
// Check if any vertex of the graph has null edges
AbstractJobVertex jv = job.findVertexWithNullEdges();
if (jv != null) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, "Vertex "
+ jv.getName() + " has at least one null edge");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Submitted job " + job.getName() + " has no null edges");
}
// Next, check if the graph is weakly connected
if (!job.isWeaklyConnected()) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
"Job graph is not weakly connected");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("The graph of job " + job.getName() + " is weakly connected");
}
// Check if job graph has cycles
if (!job.isAcyclic()) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
"Job graph is not a DAG");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("The graph of job " + job.getName() + " is acyclic");
}
// Check constrains on degree
jv = job.areVertexDegreesCorrect();
if (jv != null) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
"Degree of vertex " + jv.getName() + " is incorrect");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("All vertices of job " + job.getName() + " have the correct degree");
}
if (!job.isInstanceDependencyChainAcyclic()) {
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR,
"The dependency chain for instance sharing contains a cycle");
return result;
}
if (LOG.isDebugEnabled()) {
LOG.debug("The dependency chain for instance sharing is acyclic");
}
// Check if the job will be executed with profiling enabled
boolean jobRunsWithProfiling = false;
if (this.profiler != null && job.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
jobRunsWithProfiling = true;
}
// Try to create initial execution graph from job graph
LOG.info("Creating initial execution graph from job graph " + job.getName());
ExecutionGraph eg;
try {
eg = new ExecutionGraph(job, this.instanceManager);
} catch (GraphConversionException e) {
if (e.getCause() == null) {
return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(e));
} else {
Throwable t = e.getCause();
if (t instanceof FileNotFoundException) {
return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, t.getMessage());
} else {
return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
}
}
}
// Register job with the progress collector
if (this.eventCollector != null) {
this.eventCollector.registerJob(eg, jobRunsWithProfiling, System.currentTimeMillis());
}
// Check if profiling should be enabled for this job
if (jobRunsWithProfiling) {
this.profiler.registerProfilingJob(eg);
if (this.eventCollector != null) {
this.profiler.registerForProfilingData(eg.getJobID(), this.eventCollector);
}
}
// Register job with the dynamic input split assigner
this.inputSplitManager.registerJob(eg);
// Register for updates on the job status
eg.registerJobStatusListener(this);
// Schedule job
if (LOG.isInfoEnabled()) {
LOG.info("Scheduling job " + job.getName());
}
try {
this.scheduler.schedulJob(eg);
} catch (SchedulingException e) {
unregisterJob(eg);
JobSubmissionResult result = new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(e));
return result;
}
// Return on success
return new JobSubmissionResult(AbstractJobResult.ReturnCode.SUCCESS, null);
}
catch (Throwable t) {
LOG.error("Job submission failed.", t);
return new JobSubmissionResult(AbstractJobResult.ReturnCode.ERROR, StringUtils.stringifyException(t));
}
}
public InstanceManager getInstanceManager() {
return this.instanceManager;
}
/**
* This method is a convenience method to unregister a job from all of
* Nephele's monitoring, profiling and optimization components at once.
* Currently, it is only being used to unregister from profiling (if activated).
*
* @param executionGraph
* the execution graph to remove from the job manager
*/
private void unregisterJob(final ExecutionGraph executionGraph) {
// Remove job from profiler (if activated)
if (this.profiler != null
&& executionGraph.getJobConfiguration().getBoolean(ProfilingUtils.PROFILE_JOB_KEY, true)) {
this.profiler.unregisterProfilingJob(executionGraph);
if (this.eventCollector != null) {
this.profiler.unregisterFromProfilingData(executionGraph.getJobID(), this.eventCollector);
}
}
// Cancel all pending requests for instances
this.instanceManager.cancelPendingRequests(executionGraph.getJobID()); // getJobID is final member, no
// synchronization necessary
// Remove job from input split manager
if (this.inputSplitManager != null) {
this.inputSplitManager.unregisterJob(executionGraph);
}
// Unregister job with library cache manager
try {
LibraryCacheManager.unregister(executionGraph.getJobID());
} catch (IOException ioe) {
if (LOG.isWarnEnabled()) {
LOG.warn(ioe);
}
}
}
@Override
public void sendHeartbeat(final InstanceConnectionInfo instanceConnectionInfo,
final HardwareDescription hardwareDescription) {
// Delegate call to instance manager
if (this.instanceManager != null) {
final Runnable heartBeatRunnable = new Runnable() {
@Override
public void run() {
instanceManager.reportHeartBeat(instanceConnectionInfo, hardwareDescription);
}
};
this.executorService.execute(heartBeatRunnable);
}
}
@Override
public void updateTaskExecutionState(final TaskExecutionState executionState) throws IOException {
// Ignore calls with executionResult == null
if (executionState == null) {
LOG.error("Received call to updateTaskExecutionState with executionState == null");
return;
}
if (executionState.getExecutionState() == ExecutionState.FAILED) {
LOG.error(executionState.getDescription());
}
final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(executionState.getJobID());
if (eg == null) {
LOG.error("Cannot find execution graph for ID " + executionState.getJobID() + " to change state to "
+ executionState.getExecutionState());
return;
}
final ExecutionVertex vertex = eg.getVertexByID(executionState.getID());
if (vertex == null) {
LOG.error("Cannot find vertex with ID " + executionState.getID() + " of job " + eg.getJobID()
+ " to change state to " + executionState.getExecutionState());
return;
}
// Asynchronously update execute state of vertex
vertex.updateExecutionStateAsynchronously(executionState.getExecutionState(), executionState.getDescription());
}
@Override
public JobCancelResult cancelJob(final JobID jobID) throws IOException {
LOG.info("Trying to cancel job with ID " + jobID);
final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
if (eg == null) {
return new JobCancelResult(ReturnCode.ERROR, "Cannot find job with ID " + jobID);
}
final Runnable cancelJobRunnable = new Runnable() {
@Override
public void run() {
eg.updateJobStatus(InternalJobStatus.CANCELING, "Job canceled by user");
final TaskCancelResult cancelResult = cancelJob(eg);
if (cancelResult != null) {
LOG.error(cancelResult.getDescription());
}
}
};
eg.executeCommand(cancelJobRunnable);
LOG.info("Cancel of job " + jobID + " successfully triggered");
return new JobCancelResult(AbstractJobResult.ReturnCode.SUCCESS, null);
}
/**
* Cancels all the tasks in the current and upper stages of the
* given execution graph.
*
* @param eg
* the execution graph representing the job to cancel.
* @return <code>null</code> if no error occurred during the cancel attempt,
* otherwise the returned object will describe the error
*/
private TaskCancelResult cancelJob(final ExecutionGraph eg) {
TaskCancelResult errorResult = null;
/**
* Cancel all nodes in the current and upper execution stages.
*/
final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, eg.getIndexOfCurrentExecutionStage(),
false, true);
while (it.hasNext()) {
final ExecutionVertex vertex = it.next();
final TaskCancelResult result = vertex.cancelTask();
if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
errorResult = result;
}
}
return errorResult;
}
@Override
public JobProgressResult getJobProgress(final JobID jobID) throws IOException {
if (this.eventCollector == null) {
return new JobProgressResult(ReturnCode.ERROR, "JobManager does not support progress reports for jobs",
null);
}
final SerializableArrayList<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();
this.eventCollector.getEventsForJob(jobID, eventList, false);
return new JobProgressResult(ReturnCode.SUCCESS, null, eventList);
}
@Override
public ConnectionInfoLookupResponse lookupConnectionInfo(InstanceConnectionInfo caller, JobID jobID, ChannelID sourceChannelID) {
final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
if (eg == null) {
LOG.error("Cannot find execution graph to job ID " + jobID);
return ConnectionInfoLookupResponse.createReceiverNotFound();
}
final InternalJobStatus jobStatus = eg.getJobStatus();
if (jobStatus == InternalJobStatus.FAILING || jobStatus == InternalJobStatus.CANCELING) {
return ConnectionInfoLookupResponse.createJobIsAborting();
}
final ExecutionEdge edge = eg.getEdgeByID(sourceChannelID);
if (edge == null) {
LOG.error("Cannot find execution edge associated with ID " + sourceChannelID);
return ConnectionInfoLookupResponse.createReceiverNotFound();
}
if (sourceChannelID.equals(edge.getInputChannelID())) {
// Request was sent from an input channel
final ExecutionVertex connectedVertex = edge.getOutputGate().getVertex();
final AbstractInstance assignedInstance = connectedVertex.getAllocatedResource().getInstance();
if (assignedInstance == null) {
LOG.error("Cannot resolve lookup: vertex found for channel ID " + edge.getOutputGateIndex()
+ " but no instance assigned");
// LOG.info("Created receiverNotReady for " + connectedVertex + " 1");
return ConnectionInfoLookupResponse.createReceiverNotReady();
}
// Check execution state
final ExecutionState executionState = connectedVertex.getExecutionState();
if (executionState == ExecutionState.FINISHED) {
// that should not happen. if there is data pending, the receiver cannot be ready
return ConnectionInfoLookupResponse.createReceiverNotFound();
}
// running is common, finishing is happens when the lookup is for the close event
if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING) {
// LOG.info("Created receiverNotReady for " + connectedVertex + " in state " + executionState + " 2");
return ConnectionInfoLookupResponse.createReceiverNotReady();
}
if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
// Receiver runs on the same task manager
return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getOutputChannelID());
} else {
// Receiver runs on a different task manager
final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());
return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa, edge.getConnectionID()));
}
}
// else, the request is for an output channel
// Find vertex of connected input channel
final ExecutionVertex targetVertex = edge.getInputGate().getVertex();
// Check execution state
final ExecutionState executionState = targetVertex.getExecutionState();
// check whether the task needs to be deployed
if (executionState != ExecutionState.RUNNING && executionState != ExecutionState.FINISHING && executionState != ExecutionState.FINISHED) {
if (executionState == ExecutionState.ASSIGNED) {
final Runnable command = new Runnable() {
@Override
public void run() {
scheduler.deployAssignedVertices(targetVertex);
}
};
eg.executeCommand(command);
}
// LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 3");
return ConnectionInfoLookupResponse.createReceiverNotReady();
}
final AbstractInstance assignedInstance = targetVertex.getAllocatedResource().getInstance();
if (assignedInstance == null) {
LOG.error("Cannot resolve lookup: vertex found for channel ID " + edge.getInputChannelID() + " but no instance assigned");
// LOG.info("Created receiverNotReady for " + targetVertex + " in state " + executionState + " 4");
return ConnectionInfoLookupResponse.createReceiverNotReady();
}
if (assignedInstance.getInstanceConnectionInfo().equals(caller)) {
// Receiver runs on the same task manager
return ConnectionInfoLookupResponse.createReceiverFoundAndReady(edge.getInputChannelID());
} else {
// Receiver runs on a different task manager
final InstanceConnectionInfo ici = assignedInstance.getInstanceConnectionInfo();
final InetSocketAddress isa = new InetSocketAddress(ici.address(), ici.dataPort());
return ConnectionInfoLookupResponse.createReceiverFoundAndReady(new RemoteReceiver(isa, edge.getConnectionID()));
}
}
/**
* Returns current ManagementGraph from eventCollector and, if not current, from archive
*
* {@inheritDoc}
*/
@Override
public ManagementGraph getManagementGraph(final JobID jobID) throws IOException {
ManagementGraph mg = this.eventCollector.getManagementGraph(jobID);
if (mg == null) {
if(this.archive != null) {
mg = this.archive.getManagementGraph(jobID);
}
if (mg == null) {
throw new IOException("Cannot find job with ID " + jobID);
}
}
return mg;
}
@Override
public NetworkTopology getNetworkTopology(final JobID jobID) throws IOException {
if (this.instanceManager != null) {
return this.instanceManager.getNetworkTopology(jobID);
}
return null;
}
@Override
public IntegerRecord getRecommendedPollingInterval() throws IOException {
return new IntegerRecord(this.recommendedClientPollingInterval);
}
@Override
public List<RecentJobEvent> getRecentJobs() throws IOException {
final List<RecentJobEvent> eventList = new SerializableArrayList<RecentJobEvent>();
if (this.eventCollector == null) {
throw new IOException("No instance of the event collector found");
}
this.eventCollector.getRecentJobs(eventList);
return eventList;
}
@Override
public List<AbstractEvent> getEvents(final JobID jobID) throws IOException {
final List<AbstractEvent> eventList = new SerializableArrayList<AbstractEvent>();
if (this.eventCollector == null) {
throw new IOException("No instance of the event collector found");
}
this.eventCollector.getEventsForJob(jobID, eventList, true);
return eventList;
}
@Override
public void killTask(final JobID jobID, final ManagementVertexID id) throws IOException {
final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
if (eg == null) {
LOG.error("Cannot find execution graph for job " + jobID);
return;
}
final ExecutionVertex vertex = eg.getVertexByID(ExecutionVertexID.fromManagementVertexID(id));
if (vertex == null) {
LOG.error("Cannot find execution vertex with ID " + id);
return;
}
LOG.info("Killing task " + vertex + " of job " + jobID);
final Runnable runnable = new Runnable() {
@Override
public void run() {
final TaskKillResult result = vertex.killTask();
if (result.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
LOG.error(result.getDescription());
}
}
};
eg.executeCommand(runnable);
}
@Override
public void killInstance(final StringRecord instanceName) throws IOException {
final AbstractInstance instance = this.instanceManager.getInstanceByName(instanceName.toString());
if (instance == null) {
LOG.error("Cannot find instance with name " + instanceName + " to kill it");
return;
}
LOG.info("Killing task manager on instance " + instance);
final Runnable runnable = new Runnable() {
@Override
public void run() {
try {
instance.killTaskManager();
} catch (IOException ioe) {
LOG.error(ioe);
}
}
};
// Hand it over to the executor service
this.executorService.execute(runnable);
}
/**
* Tests whether the job manager has been shut down completely.
*
* @return <code>true</code> if the job manager has been shut down completely, <code>false</code> otherwise
*/
public boolean isShutDown() {
return this.isShutDown;
}
public Map<InstanceType, InstanceTypeDescription> getMapOfAvailableInstanceTypes() {
// Delegate call to the instance manager
if (this.instanceManager != null) {
return this.instanceManager.getMapOfAvailableInstanceTypes();
}
return null;
}
@Override
public void jobStatusHasChanged(final ExecutionGraph executionGraph, final InternalJobStatus newJobStatus,
final String optionalMessage) {
LOG.info("Status of job " + executionGraph.getJobName() + "(" + executionGraph.getJobID() + ")"
+ " changed to " + newJobStatus);
if (newJobStatus == InternalJobStatus.FAILING) {
// Cancel all remaining tasks
cancelJob(executionGraph);
}
if (newJobStatus == InternalJobStatus.CANCELED || newJobStatus == InternalJobStatus.FAILED
|| newJobStatus == InternalJobStatus.FINISHED) {
// Unregister job for Nephele's monitoring, optimization components, and dynamic input split assignment
unregisterJob(executionGraph);
}
}
@Override
public void logBufferUtilization(final JobID jobID) throws IOException {
final ExecutionGraph eg = this.scheduler.getExecutionGraphByID(jobID);
if (eg == null) {
return;
}
final Set<AbstractInstance> allocatedInstance = new HashSet<AbstractInstance>();
final Iterator<ExecutionVertex> it = new ExecutionGraphIterator(eg, true);
while (it.hasNext()) {
final ExecutionVertex vertex = it.next();
final ExecutionState state = vertex.getExecutionState();
if (state == ExecutionState.RUNNING || state == ExecutionState.FINISHING) {
final AbstractInstance instance = vertex.getAllocatedResource().getInstance();
if (instance instanceof DummyInstance) {
LOG.error("Found instance of type DummyInstance for vertex " + vertex.getName() + " (state "
+ state + ")");
continue;
}
allocatedInstance.add(instance);
}
}
// Send requests to task managers from separate thread
final Runnable requestRunnable = new Runnable() {
@Override
public void run() {
final Iterator<AbstractInstance> it2 = allocatedInstance.iterator();
try {
while (it2.hasNext()) {
it2.next().logBufferUtilization();
}
} catch (IOException ioe) {
LOG.error(ioe);
}
}
};
// Hand over to the executor service
this.executorService.execute(requestRunnable);
}
@Override
public void deploy(final JobID jobID, final AbstractInstance instance,
final List<ExecutionVertex> verticesToBeDeployed) {
if (verticesToBeDeployed.isEmpty()) {
LOG.error("Method 'deploy' called but list of vertices to be deployed is empty");
return;
}
for (final ExecutionVertex vertex : verticesToBeDeployed) {
// Check vertex state
if (vertex.getExecutionState() != ExecutionState.READY) {
LOG.error("Expected vertex " + vertex + " to be in state READY but it is in state "
+ vertex.getExecutionState());
}
vertex.updateExecutionState(ExecutionState.STARTING, null);
}
// Create a new runnable and pass it the executor service
final Runnable deploymentRunnable = new Runnable() {
/**
* {@inheritDoc}
*/
@Override
public void run() {
// Check if all required libraries are available on the instance
try {
instance.checkLibraryAvailability(jobID);
} catch (IOException ioe) {
LOG.error("Cannot check library availability: " + StringUtils.stringifyException(ioe));
}
final List<TaskDeploymentDescriptor> submissionList = new SerializableArrayList<TaskDeploymentDescriptor>();
// Check the consistency of the call
for (final ExecutionVertex vertex : verticesToBeDeployed) {
submissionList.add(vertex.constructDeploymentDescriptor());
LOG.info("Starting task " + vertex + " on " + vertex.getAllocatedResource().getInstance());
}
List<TaskSubmissionResult> submissionResultList = null;
try {
submissionResultList = instance.submitTasks(submissionList);
} catch (final IOException ioe) {
final String errorMsg = StringUtils.stringifyException(ioe);
for (final ExecutionVertex vertex : verticesToBeDeployed) {
vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, errorMsg);
}
}
if (verticesToBeDeployed.size() != submissionResultList.size()) {
LOG.error("size of submission result list does not match size of list with vertices to be deployed");
}
int count = 0;
for (final TaskSubmissionResult tsr : submissionResultList) {
ExecutionVertex vertex = verticesToBeDeployed.get(count++);
if (!vertex.getID().equals(tsr.getVertexID())) {
LOG.error("Expected different order of objects in task result list");
vertex = null;
for (final ExecutionVertex candVertex : verticesToBeDeployed) {
if (tsr.getVertexID().equals(candVertex.getID())) {
vertex = candVertex;
break;
}
}
if (vertex == null) {
LOG.error("Cannot find execution vertex for vertex ID " + tsr.getVertexID());
continue;
}
}
if (tsr.getReturnCode() != AbstractTaskResult.ReturnCode.SUCCESS) {
// Change the execution state to failed and let the scheduler deal with the rest
vertex.updateExecutionStateAsynchronously(ExecutionState.FAILED, tsr.getDescription());
}
}
}
};
this.executorService.execute(deploymentRunnable);
}
@Override
public InputSplitWrapper requestNextInputSplit(final JobID jobID, final ExecutionVertexID vertexID,
final IntegerRecord sequenceNumber) throws IOException {
final ExecutionGraph graph = this.scheduler.getExecutionGraphByID(jobID);
if (graph == null) {
LOG.error("Cannot find execution graph to job ID " + jobID);
return null;
}
final ExecutionVertex vertex = graph.getVertexByID(vertexID);
if (vertex == null) {
LOG.error("Cannot find execution vertex for vertex ID " + vertexID);
return null;
}
return new InputSplitWrapper(jobID, this.inputSplitManager.getNextInputSplit(vertex, sequenceNumber.getValue()));
}
/**
* Starts the Jetty Infoserver for the Jobmanager
*
*/
public void startInfoServer() {
final Configuration config = GlobalConfiguration.getConfiguration();
// Start InfoServer
try {
int port = config.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_FRONTEND_PORT);
server = new WebInfoServer(config, port, this);
server.start();
} catch (FileNotFoundException e) {
LOG.error(e.getMessage(), e);
} catch (Exception e) {
LOG.error("Cannot instantiate info server: " + e.getMessage(), e);
}
}
// TODO Add to RPC?
public List<RecentJobEvent> getOldJobs() throws IOException {
//final List<RecentJobEvent> eventList = new SerializableArrayList<RecentJobEvent>();
if (this.archive == null) {
throw new IOException("No instance of the event collector found");
}
//this.eventCollector.getRecentJobs(eventList);
return this.archive.getJobs();
}
public ArchiveListener getArchive() {
return this.archive;
}
public int getNumberOfTaskTrackers() {
return this.instanceManager.getNumberOfTaskTrackers();
}
@Override
public void reportAccumulatorResult(AccumulatorEvent accumulatorEvent) throws IOException {
this.accumulatorManager.processIncomingAccumulators(accumulatorEvent.getJobID(),
accumulatorEvent.getAccumulators());
}
@Override
public AccumulatorEvent getAccumulatorResults(JobID jobID) throws IOException {
return new AccumulatorEvent(jobID, this.accumulatorManager.getJobAccumulators(jobID), false);
}
}