Source Code of eu.stratosphere.api.java.ExecutionEnvironment

/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/
package eu.stratosphere.api.java;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;

import org.apache.commons.lang3.Validate;

import eu.stratosphere.api.common.InvalidProgramException;
import eu.stratosphere.api.common.JobExecutionResult;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.io.InputFormat;
import eu.stratosphere.api.java.io.CollectionInputFormat;
import eu.stratosphere.api.java.io.CsvReader;
import eu.stratosphere.api.java.io.IteratorInputFormat;
import eu.stratosphere.api.java.io.ParallelIteratorInputFormat;
import eu.stratosphere.api.java.io.TextInputFormat;
import eu.stratosphere.api.java.io.TextValueInputFormat;
import eu.stratosphere.api.java.operators.DataSink;
import eu.stratosphere.api.java.operators.DataSource;
import eu.stratosphere.api.java.operators.Operator;
import eu.stratosphere.api.java.operators.OperatorTranslation;
import eu.stratosphere.api.java.operators.translation.JavaPlan;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.typeutils.BasicTypeInfo;
import eu.stratosphere.api.java.typeutils.ResultTypeQueryable;
import eu.stratosphere.api.java.typeutils.TypeExtractor;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.api.java.typeutils.ValueTypeInfo;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.NumberSequenceIterator;
import eu.stratosphere.util.SplittableIterator;

/**
* The ExecutionEnvironment is the context in which a program is executed. A
* {@link LocalEnvironment} will cause execution in the current JVM, a
* {@link RemoteEnvironment} will cause execution on a remote setup.
* <p>
* The environment provides methods to control the job execution (such as setting the parallelism)
* and to interact with the outside world (data access).
* <p>
* Please note that the execution environment needs strong type information for the input and return types
* of all operations that are executed. This means that the environment needs to know that the return
* value of an operation is, for example, a Tuple of String and Integer.
* Because the Java compiler discards much of the generic type information, most methods attempt to
* re-obtain that information using reflection. In certain cases, it may be necessary to manually supply
* that information to some of the methods.
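* <p>
* A minimal usage sketch (all paths and the job name are placeholders):
* <pre>{@code
* ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
* DataSet<String> lines = env.readTextFile("file:///path/to/input");
* lines.writeAsText("file:///path/to/output");
* env.execute("Example Job");
* }</pre>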
*
* @see LocalEnvironment
* @see RemoteEnvironment
*/
public abstract class ExecutionEnvironment {
 
  /** The environment of the context (local by default, cluster if invoked through command line) */
  private static ExecutionEnvironment contextEnvironment;
 
  /** The default parallelism used by local environments */
  private static int defaultLocalDop = Runtime.getRuntime().availableProcessors();
 
  /** flag to disable local executor when using the ContextEnvironment */
  private static boolean allowLocalExecution = true;
 
  // --------------------------------------------------------------------------------------------
 
  private final UUID executionId;
 
  private final List<DataSink<?>> sinks = new ArrayList<DataSink<?>>();
 
  private final List<Tuple2<String, String>> cacheFile = new ArrayList<Tuple2<String, String>>();

  private int degreeOfParallelism = -1;
 
 
  // --------------------------------------------------------------------------------------------
  //  Constructor and Properties
  // --------------------------------------------------------------------------------------------
 
  /**
   * Creates a new Execution Environment.
   */
  protected ExecutionEnvironment() {
    this.executionId = UUID.randomUUID();
  }
 
  /**
   * Gets the degree of parallelism with which operations are executed by default. Operations can
   * individually override this value to use a specific degree of parallelism via
   * {@link Operator#setParallelism(int)}. Other operations may need to run with a different
   * degree of parallelism - for example, calling
   * {@link DataSet#reduce(eu.stratosphere.api.java.functions.ReduceFunction)} over the entire
   * set will eventually insert an operation that runs non-parallel (with a degree of parallelism of one).
   *
   * @return The degree of parallelism used by operations, unless they override that value. This method
   *         returns {@code -1}, if the environment's default parallelism should be used.
   */
  public int getDegreeOfParallelism() {
    return degreeOfParallelism;
  }
 
  /**
   * Sets the degree of parallelism (DOP) for operations executed through this environment.
   * Setting a DOP of x here will cause all operators (such as join, map, reduce) to run with
   * x parallel instances.
   * <p>
   * This method overrides the default parallelism for this environment.
   * The {@link LocalEnvironment} uses by default a value equal to the number of hardware
   * contexts (CPU cores / threads). When executing the program via the command line client
   * from a JAR file, the default degree of parallelism is the one configured for that setup.
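   * <p>
   * For example (the value 4 is an arbitrary illustration):
   * <pre>{@code
   * ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
   * env.setDegreeOfParallelism(4); // join, map, reduce, etc. now default to 4 parallel instances
   * }</pre>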
   *
   * @param degreeOfParallelism The degree of parallelism
   */
  public void setDegreeOfParallelism(int degreeOfParallelism) {
    if (degreeOfParallelism < 1) {
      throw new IllegalArgumentException("Degree of parallelism must be at least one.");
    }
   
    this.degreeOfParallelism = degreeOfParallelism;
  }
 
  /**
   * Gets the UUID by which this environment is identified. The UUID identifies the execution
   * context in the cluster or local environment.
   *
   * @return The UUID of this environment.
   * @see #getIdString()
   */
  public UUID getId() {
    return this.executionId;
  }
 
  /**
   * Gets the UUID by which this environment is identified, as a string.
   *
   * @return The UUID as a string.
   * @see #getId()
   */
  public String getIdString() {
    return this.executionId.toString();
  }
 
  // --------------------------------------------------------------------------------------------
  //  Data set creations
  // --------------------------------------------------------------------------------------------

  // ---------------------------------- Text Input Format ---------------------------------------
 
  /**
   * Creates a DataSet that represents the Strings produced by reading the given file line wise.
   * The file will be read with the system's default character set.
   *
   * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
   * @return A DataSet that represents the data read from the given file as text lines.
   */
  public DataSource<String> readTextFile(String filePath) {
    Validate.notNull(filePath, "The file path may not be null.");
   
    return new DataSource<String>(this, new TextInputFormat(new Path(filePath)), BasicTypeInfo.STRING_TYPE_INFO );
  }
 
  /**
   * Creates a DataSet that represents the Strings produced by reading the given file line wise.
   * The {@link java.nio.charset.Charset} with the given name will be used to read the files.
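   * <p>
   * For example (path and charset name are illustrative; {@code env} is an ExecutionEnvironment):
   * <pre>{@code
   * DataSet<String> lines = env.readTextFile("hdfs://host:port/file/path", "UTF-8");
   * }</pre>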
   *
   * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
   * @param charsetName The name of the character set used to read the file.
   * @return A DataSet that represents the data read from the given file as text lines.
   */
  public DataSource<String> readTextFile(String filePath, String charsetName) {
    Validate.notNull(filePath, "The file path may not be null.");

    TextInputFormat format = new TextInputFormat(new Path(filePath));
    format.setCharsetName(charsetName);
    return new DataSource<String>(this, format, BasicTypeInfo.STRING_TYPE_INFO );
  }
 
  // -------------------------- Text Input Format With String Value------------------------------
 
  /**
   * Creates a DataSet that represents the Strings produced by reading the given file line wise.
   * This method is similar to {@link #readTextFile(String)}, but it produces a DataSet with mutable
   * {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
   * to reduce object instantiation and garbage collection pressure.
   * <p>
   * The file will be read with the system's default character set.
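   * <p>
   * For example (the path is a placeholder; {@code env} is an ExecutionEnvironment):
   * <pre>{@code
   * DataSet<StringValue> lines = env.readTextFileWithValue("file:///path/to/input");
   * }</pre>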
   *
   * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
   * @return A DataSet that represents the data read from the given file as text lines.
   */
  public DataSource<StringValue> readTextFileWithValue(String filePath) {
    Validate.notNull(filePath, "The file path may not be null.");
   
    return new DataSource<StringValue>(this, new TextValueInputFormat(new Path(filePath)), new ValueTypeInfo<StringValue>(StringValue.class) );
  }
 
  /**
   * Creates a DataSet that represents the Strings produced by reading the given file line wise.
   * This method is similar to {@link #readTextFile(String, String)}, but it produces a DataSet with mutable
   * {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
   * to reduce object instantiation and garbage collection pressure.
   * <p>
   * The {@link java.nio.charset.Charset} with the given name will be used to read the files.
   *
   * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
   * @param charsetName The name of the character set used to read the file.
   * @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set.
   *
   * @return A DataSet that represents the data read from the given file as text lines.
   */
  public DataSource<StringValue> readTextFileWithValue(String filePath, String charsetName, boolean skipInvalidLines) {
    Validate.notNull(filePath, "The file path may not be null.");
   
    TextValueInputFormat format = new TextValueInputFormat(new Path(filePath));
    format.setCharsetName(charsetName);
    format.setSkipInvalidLines(skipInvalidLines);
    return new DataSource<StringValue>(this, format, new ValueTypeInfo<StringValue>(StringValue.class) );
  }
 
  // ----------------------------------- CSV Input Format ---------------------------------------
 
  /**
   * Creates a CSV reader to read a comma separated value (CSV) file. The reader has options to
   * define parameters and field types and will eventually produce the DataSet that corresponds to
   * the read and parsed CSV input.
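   * <p>
   * A configuration sketch (path and field types are illustrative; {@code env} is an ExecutionEnvironment):
   * <pre>{@code
   * DataSet<Tuple2<String, Integer>> csvInput = env.readCsvFile("file:///path/to/file.csv")
   *                                                .types(String.class, Integer.class);
   * }</pre>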
   *
   * @param filePath The path of the CSV file.
   * @return A CsvReader that can be used to configure the CSV input.
   */
  public CsvReader readCsvFile(String filePath) {
    return new CsvReader(filePath, this);
  }
 
  // ----------------------------------- Generic Input Format ---------------------------------------
 
  /**
   * Generic method to create an input DataSet with an {@link InputFormat}. The DataSet will not be
   * immediately created - instead, this method returns a DataSet that will be lazily created from
   * the input format once the program is executed.
   * <p>
   * Since all data sets need specific information about their types, this method needs to determine
   * the type of the data produced by the input format. It will attempt to determine the data type
   * by reflection, unless the input format implements the {@link ResultTypeQueryable} interface.
   * In the latter case, this method will invoke the {@link ResultTypeQueryable#getProducedType()}
   * method to determine the data type produced by the input format.
   *
   * @param inputFormat The input format used to create the data set.
   * @return A DataSet that represents the data created by the input format.
   *
   * @see #createInput(InputFormat, TypeInformation)
   */
  public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat) {
    if (inputFormat == null) {
      throw new IllegalArgumentException("InputFormat must not be null.");
    }
   
    try {
      return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
    }
    catch (Exception e) {
      throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
          "Please specify the TypeInformation of the produced type explicitly.");
    }
  }

  /**
   * Generic method to create an input DataSet with an {@link InputFormat}. The DataSet will not be
   * immediately created - instead, this method returns a DataSet that will be lazily created from
   * the input format once the program is executed.
   * <p>
   * The data set is typed to the given TypeInformation. This method is intended for input formats
   * where the return type cannot be determined by reflection analysis, and that do not implement the
   * {@link ResultTypeQueryable} interface.
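   * <p>
   * Sketch ({@code MyInputFormat} is a hypothetical input format producing Strings; {@code env} is an
   * ExecutionEnvironment):
   * <pre>{@code
   * DataSet<String> data = env.createInput(new MyInputFormat(), BasicTypeInfo.STRING_TYPE_INFO);
   * }</pre>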
   *
   * @param inputFormat The input format used to create the data set.
   * @param producedType The TypeInformation for the type produced by the input format.
   * @return A DataSet that represents the data created by the input format.
   *
   * @see #createInput(InputFormat)
   */
  public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
    if (inputFormat == null) {
      throw new IllegalArgumentException("InputFormat must not be null.");
    }
   
    if (producedType == null) {
      throw new IllegalArgumentException("Produced type information must not be null.");
    }
   
    return new DataSource<X>(this, inputFormat, producedType);
  }
 
  // ----------------------------------- Collection ---------------------------------------
 
  /**
   * Creates a DataSet from the given non-empty collection. The type of the data set is that
   * of the elements in the collection. The elements need to be serializable (as defined by
   * {@link java.io.Serializable}), because the framework may move the elements into the cluster
   * if needed.
   * <p>
   * The framework will try to determine the exact type from the collection elements.
   * In case of generic elements, it may be necessary to manually supply the type information
   * via {@link #fromCollection(Collection, TypeInformation)}.
   * <p>
   * Note that this operation will result in a non-parallel data source, i.e. a data source with
   * a degree of parallelism of one.
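   * <p>
   * For example (given an ExecutionEnvironment {@code env}):
   * <pre>{@code
   * List<String> names = Arrays.asList("alpha", "beta", "gamma");
   * DataSet<String> data = env.fromCollection(names);
   * }</pre>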
   *
   * @param data The collection of elements to create the data set from.
   * @return A DataSet representing the given collection.
   *
   * @see #fromCollection(Collection, TypeInformation)
   */
  public <X> DataSource<X> fromCollection(Collection<X> data) {
    if (data == null) {
      throw new IllegalArgumentException("The data must not be null.");
    }
    if (data.size() == 0) {
      throw new IllegalArgumentException("The collection must not be empty.");
    }
   
    X firstValue = data.iterator().next();
   
    return fromCollection(data, TypeExtractor.getForObject(firstValue));
  }
 
  /**
   * Creates a DataSet from the given non-empty collection. The type of the data set is that
   * of the elements in the collection. The elements need to be serializable (as defined by
   * {@link java.io.Serializable}), because the framework may move the elements into the cluster
   * if needed.
   * <p>
   * Note that this operation will result in a non-parallel data source, i.e. a data source with
   * a degree of parallelism of one.
   * <p>
   * The returned DataSet is typed to the given TypeInformation.
   * 
   * @param data The collection of elements to create the data set from.
   * @param type The TypeInformation for the produced data set.
   * @return A DataSet representing the given collection.
   *
   * @see #fromCollection(Collection)
   */
  public <X> DataSource<X> fromCollection(Collection<X> data, TypeInformation<X> type) {
    CollectionInputFormat.checkCollection(data, type.getTypeClass());
   
    return new DataSource<X>(this, new CollectionInputFormat<X>(data), type);
  }
 
  /**
   * Creates a DataSet from the given iterator. Because the iterator will remain unmodified until
   * the actual execution happens, the type of data returned by the iterator must be given
   * explicitly in the form of the type class (this is due to the fact that the Java compiler
   * erases the generic type information).
   * <p>
   * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
   * framework may move it to a remote environment, if needed.
   * <p>
   * Note that this operation will result in a non-parallel data source, i.e. a data source with
   * a degree of parallelism of one.
   *
   * @param data The collection of elements to create the data set from.
   * @param type The class of the data produced by the iterator. Must not be a generic class.
   * @return A DataSet representing the elements in the iterator.
   *
   * @see #fromCollection(Iterator, TypeInformation)
   */
  public <X> DataSource<X> fromCollection(Iterator<X> data, Class<X> type) {
    return fromCollection(data, TypeExtractor.getForClass(type));
  }
 
  /**
   * Creates a DataSet from the given iterator. Because the iterator will remain unmodified until
   * the actual execution happens, the type of data returned by the iterator must be given
   * explicitly in the form of the type information. This method is useful for cases where the type
   * is generic. In that case, the type class (as given in {@link #fromCollection(Iterator, Class)})
   * does not supply all type information.
   * <p>
   * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
   * framework may move it to a remote environment, if needed.
   * <p>
   * Note that this operation will result in a non-parallel data source, i.e. a data source with
   * a degree of parallelism of one.
   *
   * @param data The collection of elements to create the data set from.
   * @param type The TypeInformation for the produced data set.
   * @return A DataSet representing the elements in the iterator.
   *
   * @see #fromCollection(Iterator, Class)
   */
  public <X> DataSource<X> fromCollection(Iterator<X> data, TypeInformation<X> type) {
    if (!(data instanceof Serializable)) {
      throw new IllegalArgumentException("The iterator must be serializable.");
    }
   
    return new DataSource<X>(this, new IteratorInputFormat<X>(data), type);
  }
 
 
  /**
   * Creates a new data set that contains the given elements. The elements must all be of the same type,
   * for example, all {@link String} or all {@link Integer}. The sequence of elements must not be empty.
   * Furthermore, the elements must be serializable (as defined in {@link java.io.Serializable}), because the
   * execution environment may ship the elements into the cluster.
   * <p>
   * The framework will try to determine the exact type from the collection elements.
   * In case of generic elements, it may be necessary to manually supply the type information
   * via {@link #fromCollection(Collection, TypeInformation)}.
   * <p>
   * Note that this operation will result in a non-parallel data source, i.e. a data source with
   * a degree of parallelism of one.
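   * <p>
   * For example (given an ExecutionEnvironment {@code env}):
   * <pre>{@code
   * DataSet<Integer> numbers = env.fromElements(1, 2, 3, 4, 5);
   * }</pre>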
   *
   * @param data The elements to make up the data set.
   * @return A DataSet representing the given list of elements.
   */
  public <X> DataSource<X> fromElements(X... data) {
    if (data == null) {
      throw new IllegalArgumentException("The data must not be null.");
    }
    if (data.length == 0) {
      throw new IllegalArgumentException("The number of elements must not be zero.");
    }
   
    return fromCollection(Arrays.asList(data), TypeExtractor.getForObject(data[0]));
  }
 
 
  /**
   * Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the
   * framework to create a parallel data source that returns the elements in the iterator.
   * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
   * execution environment may ship the elements into the cluster.
   * <p>
   * Because the iterator will remain unmodified until the actual execution happens, the type of data
   * returned by the iterator must be given explicitly in the form of the type class (this is due to the
   * fact that the Java compiler erases the generic type information).
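   * <p>
   * Sketch using the {@link NumberSequenceIterator} that ships with the framework (given an
   * ExecutionEnvironment {@code env}):
   * <pre>{@code
   * DataSet<Long> numbers = env.fromParallelCollection(new NumberSequenceIterator(1L, 100L), Long.class);
   * }</pre>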
   *
   * @param iterator The iterator that produces the elements of the data set.
   * @param type The class of the data produced by the iterator. Must not be a generic class.
   * @return A DataSet representing the elements in the iterator.
   *
   * @see #fromParallelCollection(SplittableIterator, TypeInformation)
   */
  public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, Class<X> type) {
    return fromParallelCollection(iterator, TypeExtractor.getForClass(type));
  }
 
  /**
   * Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the
   * framework to create a parallel data source that returns the elements in the iterator.
   * The iterator must be serializable (as defined in {@link java.io.Serializable}), because the
   * execution environment may ship the elements into the cluster.
   * <p>
   * Because the iterator will remain unmodified until the actual execution happens, the type of data
   * returned by the iterator must be given explicitly in the form of the type information.
   * This method is useful for cases where the type is generic. In that case, the type class
   * (as given in {@link #fromParallelCollection(SplittableIterator, Class)}) does not supply all type information.
   *
   * @param iterator The iterator that produces the elements of the data set.
   * @param type The TypeInformation for the produced data set.
   * @return A DataSet representing the elements in the iterator.
   *
   * @see #fromParallelCollection(SplittableIterator, Class)
   */
  public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, TypeInformation<X> type) {
    return new DataSource<X>(this, new ParallelIteratorInputFormat<X>(iterator), type);
  }
 
  /**
   * Creates a new data set that contains a sequence of numbers. The data set will be created in parallel,
   * so there is no guarantee about the order of the elements.
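   * <p>
   * For example (given an ExecutionEnvironment {@code env}):
   * <pre>{@code
   * DataSet<Long> numbers = env.generateSequence(1, 1000);
   * }</pre>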
   *
   * @param from The number to start at (inclusive).
   * @param to The number to stop at (inclusive).
   * @return A DataSet containing all numbers in the {@code [from, to]} interval.
   */
  public DataSource<Long> generateSequence(long from, long to) {
    return fromParallelCollection(new NumberSequenceIterator(from, to), BasicTypeInfo.LONG_TYPE_INFO);
  }
 
  // --------------------------------------------------------------------------------------------
  //  Executing
  // --------------------------------------------------------------------------------------------
 
  /**
   * Triggers the program execution. The environment will execute all parts of the program that have
   * resulted in a "sink" operation. Sink operations are, for example, printing results ({@link DataSet#print()}),
   * writing results (e.g., {@link DataSet#writeAsText(String)} or
   * {@link DataSet#write(eu.stratosphere.api.common.io.FileOutputFormat, String)}), or other generic
   * data sinks created with {@link DataSet#output(eu.stratosphere.api.common.io.OutputFormat)}.
   * <p>
   * The program execution will be logged and displayed with a generated default name.
   *
   * @return The result of the job execution, containing elapsed time and accumulators.
   * @throws Exception Thrown, if the program execution fails.
   */
  public JobExecutionResult execute() throws Exception {
    return execute(getDefaultName());
  }
 
  /**
   * Triggers the program execution. The environment will execute all parts of the program that have
   * resulted in a "sink" operation. Sink operations are, for example, printing results ({@link DataSet#print()}),
   * writing results (e.g., {@link DataSet#writeAsText(String)} or
   * {@link DataSet#write(eu.stratosphere.api.common.io.FileOutputFormat, String)}), or other generic
   * data sinks created with {@link DataSet#output(eu.stratosphere.api.common.io.OutputFormat)}.
   * <p>
   * The program execution will be logged and displayed with the given job name.
   *
   * @param jobName The name under which the job execution is logged and displayed.
   * @return The result of the job execution, containing elapsed time and accumulators.
   * @throws Exception Thrown, if the program execution fails.
   */
  public abstract JobExecutionResult execute(String jobName) throws Exception;
 
  /**
   * Creates the plan with which the system will execute the program, and returns it as
   * a String using a JSON representation of the execution data flow graph.
   *
   * @return The execution plan of the program, as a JSON String.
   * @throws Exception Thrown, if the compiler could not be instantiated, or the master could not
   *                   be contacted to retrieve information relevant to the execution planning.
   */
  public abstract String getExecutionPlan() throws Exception;
 
  /**
   * Registers a file at the distributed cache under the given name. The file will be accessible
   * from any user-defined function in the (distributed) runtime under a local path. Files
   * may be local files, or files in a distributed file system. The runtime will copy the files
   * temporarily to a local cache, if needed.
   * <p>
   * The {@link eu.stratosphere.api.common.functions.RuntimeContext} can be obtained inside UDFs via
   * {@link eu.stratosphere.api.common.functions.Function#getRuntimeContext()} and provides access to the
   * {@link eu.stratosphere.api.common.cache.DistributedCache} via
   * {@link eu.stratosphere.api.common.functions.RuntimeContext#getDistributedCache()}.
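   * <p>
   * Registration sketch (path and name are placeholders; {@code env} is an ExecutionEnvironment):
   * <pre>{@code
   * env.registerCachedFile("hdfs://host:port/and/path", "lookupData");
   * }</pre>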
   *
   * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path")
   * @param name The name under which the file is registered.
   */
  public void registerCachedFile(String filePath, String name) {
    this.cacheFile.add(new Tuple2<String, String>(filePath, name));
  }
 
  /**
   * Registers all files that were registered at this execution environment's cache registry with the
   * given plan's cache registry.
   *
   * @param p The plan to register files at.
   * @throws IOException Thrown if checks for existence and sanity fail.
   */
  protected void registerCachedFilesWithPlan(Plan p) throws IOException {
    for (Tuple2<String, String> entry : cacheFile) {
      p.registerCachedFile(entry.f0, entry.f1);
    }
  }
 
  /**
   * Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks,
   * and operations and how they interact, as an isolated unit that can be executed with a
   * {@link eu.stratosphere.api.common.PlanExecutor}. Obtaining a plan and starting it with an
   * executor is an alternative way to run a program and is only possible if the program consists
   * only of distributed operations.
   *
   * @return The program's plan.
   */
  public JavaPlan createProgramPlan() {
    return createProgramPlan(null);
  }
 
  /**
   * Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks,
   * and operations and how they interact, as an isolated unit that can be executed with a
   * {@link eu.stratosphere.api.common.PlanExecutor}. Obtaining a plan and starting it with an
   * executor is an alternative way to run a program and is only possible if the program consists
   * only of distributed operations.
   *
   * @param jobName The name attached to the plan (displayed in logs and monitoring).
   * @return The program's plan.
   */
  public JavaPlan createProgramPlan(String jobName) {
    if (this.sinks.isEmpty()) {
      throw new RuntimeException("No data sinks have been created yet. A program needs at least one sink that consumes data. Examples are writing the data set or printing it.");
    }
   
    if (jobName == null) {
      jobName = getDefaultName();
    }
   
    OperatorTranslation translator = new OperatorTranslation();
    JavaPlan plan = translator.translateToPlan(this.sinks, jobName);
   
    if (getDegreeOfParallelism() > 0) {
      plan.setDefaultParallelism(getDegreeOfParallelism());
    }
   
    try {
      registerCachedFilesWithPlan(plan);
    } catch (Exception e) {
      throw new RuntimeException("Error while registering cached files: " + e.getMessage(), e);
    }
   
    return plan;
  }
 
  /**
   * Adds the given sink to this environment. Only sinks that have been added will be executed once
   * the {@link #execute()} or {@link #execute(String)} method is called.
   *
   * @param sink The sink to add for execution.
   */
  void registerDataSink(DataSink<?> sink) {
    this.sinks.add(sink);
  }
 
  /**
   * Gets a default job name, based on the timestamp when this method is invoked.
   *
   * @return A default job name.
   */
  private static String getDefaultName() {
    return "Stratosphere Java Job at " + Calendar.getInstance().getTime();
  }
 
  // --------------------------------------------------------------------------------------------
  //  Instantiation of Execution Contexts
  // --------------------------------------------------------------------------------------------
 
  /**
   * Creates an execution environment that represents the context in which the program is currently executed.
   * If the program is invoked standalone, this method returns a local execution environment, as returned by
   * {@link #createLocalEnvironment()}. If the program is invoked from within the command line client to be
   * submitted to a cluster, this method returns the execution environment of this cluster.
   *
   * @return The execution environment of the context in which the program is executed.
   */
  public static ExecutionEnvironment getExecutionEnvironment() {
    return contextEnvironment == null ? createLocalEnvironment() : contextEnvironment;
  }
 
  /**
   * Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
   * multi-threaded fashion in the same JVM as the environment was created in. The default degree of
   * parallelism of the local environment is the number of hardware contexts (CPU cores / threads),
   * unless it was specified differently by {@link #setDefaultLocalParallelism(int)}.
   *
   * @return A local execution environment.
   */
  public static LocalEnvironment createLocalEnvironment() {
    return createLocalEnvironment(defaultLocalDop);
  }
 
  /**
   * Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
   * multi-threaded fashion in the same JVM as the environment was created in. It will use the
   * degree of parallelism specified in the parameter.
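   * <p>
   * For example (the value 2 is illustrative):
   * <pre>{@code
   * LocalEnvironment env = ExecutionEnvironment.createLocalEnvironment(2);
   * }</pre>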
   *
   * @param degreeOfParallelism The degree of parallelism for the local environment.
   * @return A local execution environment with the specified degree of parallelism.
   */
  public static LocalEnvironment createLocalEnvironment(int degreeOfParallelism) {
    LocalEnvironment lee = new LocalEnvironment();
    lee.setDegreeOfParallelism(degreeOfParallelism);
    return lee;
  }
 
  /**
   * Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
   * to a cluster for execution. Note that all file paths used in the program must be accessible from the
   * cluster. The execution will use the cluster's default degree of parallelism, unless the parallelism is
   * set explicitly via {@link ExecutionEnvironment#setDegreeOfParallelism(int)}.
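   * <p>
   * Sketch (host, port, and JAR path are placeholders):
   * <pre>{@code
   * ExecutionEnvironment env = ExecutionEnvironment
   *     .createRemoteEnvironment("jobmanager-host", 6123, "/path/to/program.jar");
   * }</pre>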
   *
   * @param host The host name or address of the master (JobManager), where the program should be executed.
   * @param port The port of the master (JobManager), where the program should be executed.
   * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
   *                 user-defined functions, user-defined input formats, or any libraries, those must be
   *                 provided in the JAR files.
   * @return A remote environment that executes the program on a cluster.
   */
  public static ExecutionEnvironment createRemoteEnvironment(String host, int port, String... jarFiles) {
    return new RemoteEnvironment(host, port, jarFiles);
  }

  /**
   * Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
   * to a cluster for execution. Note that all file paths used in the program must be accessible from the
   * cluster. The execution will use the specified degree of parallelism.
   *
   * @param host The host name or address of the master (JobManager), where the program should be executed.
   * @param port The port of the master (JobManager), where the program should be executed.
   * @param degreeOfParallelism The degree of parallelism to use during the execution.
   * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
   *                 user-defined functions, user-defined input formats, or any libraries, those must be
   *                 provided in the JAR files.
   * @return A remote environment that executes the program on a cluster.
   */
  public static ExecutionEnvironment createRemoteEnvironment(String host, int port, int degreeOfParallelism, String... jarFiles) {
    RemoteEnvironment rec = new RemoteEnvironment(host, port, jarFiles);
    rec.setDegreeOfParallelism(degreeOfParallelism);
    return rec;
  }
 
  /**
   * Sets the default parallelism that will be used for the local execution environment created by
   * {@link #createLocalEnvironment()}.
   *
   * @param degreeOfParallelism The degree of parallelism to use as the default local parallelism.
   */
  public static void setDefaultLocalParallelism(int degreeOfParallelism) {
    defaultLocalDop = degreeOfParallelism;
  }
 
  // --------------------------------------------------------------------------------------------
  //  Methods to control the context and local environments for execution from packaged programs
  // --------------------------------------------------------------------------------------------
 
  protected static void initializeContextEnvironment(ExecutionEnvironment ctx) {
    contextEnvironment = ctx;
  }
 
  protected static boolean isContextEnvironmentSet() {
    return contextEnvironment != null;
  }
 
  protected static void disableLocalExecution() {
    allowLocalExecution = false;
  }
 
  public static boolean localExecutionIsAllowed() {
    return allowLocalExecution;
  }
}