/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading.flow.hadoop;
import java.io.IOException;
import java.util.Map;
import cascading.flow.BaseFlow;
import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.FlowException;
import cascading.flow.FlowProcess;
import cascading.flow.FlowStep;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.flow.planner.BaseFlowStep;
import cascading.flow.planner.PlatformInfo;
import cascading.property.PropertyUtil;
import cascading.tap.hadoop.io.HttpFileSystem;
import cascading.util.ShutdownUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import static cascading.flow.FlowProps.MAX_CONCURRENT_STEPS;
import static cascading.flow.FlowProps.PRESERVE_TEMPORARY_FILES;
/**
* Class HadoopFlow is the Apache Hadoop-specific implementation of a {@link Flow}.
* <p/>
* HadoopFlow must be created through a {@link HadoopFlowConnector} instance.
* <p/>
* If classpath paths are provided on the {@link FlowDef}, the Hadoop distributed cache mechanism will be used
* to augment the remote classpath.
* <p/>
* Any relative path elements will be uploaded to HDFS, and the resulting HDFS URI will be used in the JobConf. Note
* that all paths are added as "files" to the JobConf, not archives, so they are not needlessly uncompressed on the
* cluster side.
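* <p/>
* As a sketch, a typical construction looks like the following, where {@code Main}, the taps, and the pipes are
* illustrative placeholders, not part of this API:
* <pre>{@code
* Properties properties = new Properties();
* AppProps.setApplicationJarClass( properties, Main.class );
*
* FlowDef flowDef = FlowDef.flowDef()
*   .setName( "wordcount" )
*   .addSource( docPipe, docTap )
*   .addTailSink( wcPipe, wcTap )
*   .addToClassPath( "lib/udfs.jar" ); // synced to the cluster via the distributed cache, per above
*
* Flow flow = new HadoopFlowConnector( properties ).connect( flowDef );
* flow.complete();
* }</pre>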
*
* @see HadoopFlowConnector
*/
public class HadoopFlow extends BaseFlow<JobConf>
{
/** Field hdfsShutdown */
private static Thread hdfsShutdown = null;
/** Field shutdownHook */
private static ShutdownUtil.Hook shutdownHook;
/** Field jobConf */
private transient JobConf jobConf;
/** Field preserveTemporaryFiles */
private boolean preserveTemporaryFiles = false;
/** Field syncPaths */
private transient Map<Path, Path> syncPaths;
protected HadoopFlow()
{
}
/**
* Returns property preserveTemporaryFiles.
*
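* <p/>
* For example, a sketch assuming a properties Map that will be handed to a {@link HadoopFlowConnector}:
* <pre>{@code
* properties.put( FlowProps.PRESERVE_TEMPORARY_FILES, "true" );
* }</pre>
*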
* @param properties of type Map
* @return a boolean
*/
static boolean getPreserveTemporaryFiles( Map<Object, Object> properties )
{
return Boolean.parseBoolean( PropertyUtil.getProperty( properties, PRESERVE_TEMPORARY_FILES, "false" ) );
}
/**
* Returns the value of {@link cascading.flow.FlowProps#MAX_CONCURRENT_STEPS} set on the given JobConf, defaulting
* to 0 when unset.
*
* @param jobConf of type JobConf
* @return an int
*/
static int getMaxConcurrentSteps( JobConf jobConf )
{
return jobConf.getInt( MAX_CONCURRENT_STEPS, 0 );
}
protected HadoopFlow( PlatformInfo platformInfo, Map<Object, Object> properties, JobConf jobConf, String name, Map<String, String> flowDescriptor )
{
super( platformInfo, properties, jobConf, name, flowDescriptor );
initFromProperties( properties );
}
public HadoopFlow( PlatformInfo platformInfo, Map<Object, Object> properties, JobConf jobConf, FlowDef flowDef )
{
super( platformInfo, properties, jobConf, flowDef );
initFromProperties( properties );
}
@Override
protected void initFromProperties( Map<Object, Object> properties )
{
super.initFromProperties( properties );
preserveTemporaryFiles = getPreserveTemporaryFiles( properties );
}
protected void initConfig( Map<Object, Object> properties, JobConf parentConfig )
{
if( properties != null )
parentConfig = createConfig( properties, parentConfig );
if( parentConfig == null ) // this is ok, getConfig() will pass a default parent in
return;
jobConf = HadoopUtil.copyJobConf( parentConfig ); // prevent local values from being shared
jobConf.set( "fs.http.impl", HttpFileSystem.class.getName() );
jobConf.set( "fs.https.impl", HttpFileSystem.class.getName() );
syncPaths = HadoopUtil.addToClassPath( jobConf, getClassPath() );
}
@Override
protected void setConfigProperty( JobConf config, Object key, Object value )
{
// don't let Class or JobConf instances be set as values, even though toString is called below.
if( value instanceof Class || value instanceof JobConf )
return;
config.set( key.toString(), value.toString() );
}
@Override
protected JobConf newConfig( JobConf defaultConfig )
{
return defaultConfig == null ? new JobConf() : HadoopUtil.copyJobConf( defaultConfig );
}
@Override
public JobConf getConfig()
{
if( jobConf == null )
initConfig( null, new JobConf() );
return jobConf;
}
@Override
public JobConf getConfigCopy()
{
return HadoopUtil.copyJobConf( getConfig() );
}
@Override
public Map<Object, Object> getConfigAsProperties()
{
return HadoopUtil.createProperties( getConfig() );
}
/**
* Method getProperty returns the value associated with the given key from the underlying properties system.
*
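* For example, a sketch assuming {@code flow} is a connected HadoopFlow, reading a standard hadoop1 mapred key:
* <pre>{@code
* String tracker = flow.getProperty( "mapred.job.tracker" );
* }</pre>
*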
* @param key of type String
* @return String
*/
public String getProperty( String key )
{
return getConfig().get( key );
}
@Override
public FlowProcess<JobConf> getFlowProcess()
{
return new HadoopFlowProcess( getFlowSession(), getConfig() );
}
/**
* Method isPreserveTemporaryFiles returns true if temporary files will be preserved, and false if they will be
* cleaned up, when this Flow completes.
*
* @return the preserveTemporaryFiles (type boolean) of this Flow object.
*/
public boolean isPreserveTemporaryFiles()
{
return preserveTemporaryFiles;
}
@Override
protected void internalStart()
{
try
{
copyToDistributedCache();
deleteSinksIfReplace();
deleteTrapsIfReplace();
deleteCheckpointsIfReplace();
}
catch( IOException exception )
{
throw new FlowException( "unable to delete sinks, traps, or checkpoints", exception );
}
registerHadoopShutdownHook( this );
}
private void copyToDistributedCache()
{
HadoopUtil.syncPaths( jobConf, syncPaths, true );
}
@Override
public boolean stepsAreLocal()
{
return HadoopUtil.isLocal( getConfig() );
}
private void cleanTemporaryFiles( boolean stop )
{
if( stop ) // fs operations are unreliable during shutdown, skip cleanup
return;
// use step config so cascading.flow.step.path property is properly used
for( FlowStep<JobConf> step : getFlowSteps() )
( (BaseFlowStep<JobConf>) step ).clean();
}
private static synchronized void registerHadoopShutdownHook( Flow flow )
{
if( !flow.isStopJobsOnExit() )
return;
// guaranteed singleton here
if( shutdownHook != null )
return;
initHdfsShutdownHook();
shutdownHook = new ShutdownUtil.Hook()
{
@Override
public Priority priority()
{
return Priority.LAST; // very last thing to happen
}
@Override
public void execute()
{
callHdfsShutdownHook();
}
};
ShutdownUtil.addHook( shutdownHook );
}
private static synchronized void callHdfsShutdownHook()
{
if( hdfsShutdown != null )
hdfsShutdown.start();
}
private static synchronized void initHdfsShutdownHook()
{
if( hdfsShutdown == null )
hdfsShutdown = HadoopUtil.getHDFSShutdownHook();
}
@Override
protected void internalClean( boolean stop )
{
if( !isPreserveTemporaryFiles() )
cleanTemporaryFiles( stop );
}
@Override
protected void internalShutdown()
{
}
@Override
protected int getMaxNumParallelSteps()
{
return stepsAreLocal() ? 1 : getMaxConcurrentSteps( getConfig() );
}
@Override
protected long getTotalSliceCPUMilliSeconds()
{
// this is a hadoop2 MR specific counter/value
long counterValue = flowStats.getCounterValue( "org.apache.hadoop.mapreduce.TaskCounter", "CPU_MILLISECONDS" );
if( counterValue == 0 ) // assume the counter is simply unavailable, e.g. when not running on hadoop2 MR
return -1;
return counterValue;
}
}