/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading.flow.hadoop;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import cascading.CascadingException;
import cascading.flow.FlowProcess;
import cascading.flow.FlowSession;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Class HadoopFlowProcess is an implementation of {@link FlowProcess} for Hadoop. Use this class to gain direct
 * access to the Hadoop JobConf and Reporter interfaces.
* <p/>
* Be warned that coupling to this implementation will cause custom {@link cascading.operation.Operation}s to
* fail if they are executed on a system other than Hadoop.
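 * <p/>
 * As a minimal sketch (assuming {@code flowProcess} is the FlowProcess handed to a custom Operation at
 * runtime), the underlying Hadoop primitives may be reached like so:
 * <pre>
 * if( flowProcess instanceof HadoopFlowProcess )
 *   {
 *   JobConf jobConf = ( (HadoopFlowProcess) flowProcess ).getJobConf();
 *   Reporter reporter = ( (HadoopFlowProcess) flowProcess ).getReporter();
 *
 *   reporter.progress(); // signal liveness during a long-running operation
 *   }
 * </pre>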
*
* @see cascading.flow.FlowSession
* @see JobConf
* @see Reporter
*/
public class HadoopFlowProcess extends FlowProcess<JobConf>
{
/** Field jobConf */
final JobConf jobConf;
/** Field isMapper */
private final boolean isMapper;
/** Field reporter */
Reporter reporter = Reporter.NULL;
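/** Field outputCollector */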
private OutputCollector outputCollector;
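
/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance backed by a default
 * {@link JobConf}, marked as a mapper.
 */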
public HadoopFlowProcess()
{
this.jobConf = new JobConf();
this.isMapper = true;
}
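
/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance, wrapping the given
 * Configuration in a new {@link JobConf}.
 *
 * @param jobConf of type Configuration
 */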
public HadoopFlowProcess( Configuration jobConf )
{
this( new JobConf( jobConf ) );
}
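
/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance, marked as a mapper.
 *
 * @param jobConf of type JobConf
 */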
public HadoopFlowProcess( JobConf jobConf )
{
this.jobConf = jobConf;
this.isMapper = true;
}
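
/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance, marked as a mapper.
 *
 * @param flowSession of type FlowSession
 * @param jobConf     of type JobConf
 */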
public HadoopFlowProcess( FlowSession flowSession, JobConf jobConf )
{
super( flowSession );
this.jobConf = jobConf;
this.isMapper = true;
}

/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance.
 *
 * @param flowSession of type FlowSession
 * @param jobConf     of type JobConf
 * @param isMapper    of type boolean, true if this process is a mapper, false if a reducer
 */
public HadoopFlowProcess( FlowSession flowSession, JobConf jobConf, boolean isMapper )
{
super( flowSession );
this.jobConf = jobConf;
this.isMapper = isMapper;
}
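
/**
 * Constructor HadoopFlowProcess creates a new HadoopFlowProcess instance, copying the session, mapper flag,
 * and reporter from the given flowProcess while substituting the given jobConf.
 *
 * @param flowProcess of type HadoopFlowProcess
 * @param jobConf     of type JobConf
 */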
public HadoopFlowProcess( HadoopFlowProcess flowProcess, JobConf jobConf )
{
super( flowProcess.getCurrentSession() );
this.jobConf = jobConf;
this.isMapper = flowProcess.isMapper();
this.reporter = flowProcess.getReporter();
}
@Override
public FlowProcess copyWith( JobConf jobConf )
{
return new HadoopFlowProcess( this, jobConf );
}
/**
* Method getJobConf returns the jobConf of this HadoopFlowProcess object.
*
* @return the jobConf (type JobConf) of this HadoopFlowProcess object.
*/
public JobConf getJobConf()
{
return jobConf;
}
@Override
public JobConf getConfigCopy()
{
return HadoopUtil.copyJobConf( jobConf );
}
/**
* Method isMapper returns true if this part of the FlowProcess is a MapReduce mapper. If false, it is a reducer.
*
* @return boolean
*/
public boolean isMapper()
{
return isMapper;
}
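
/**
 * Method getCurrentNumMappers returns the number of map tasks configured for this step.
 *
 * @return int
 */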
public int getCurrentNumMappers()
{
return getJobConf().getNumMapTasks();
}
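
/**
 * Method getCurrentNumReducers returns the number of reduce tasks configured for this step.
 *
 * @return int
 */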
public int getCurrentNumReducers()
{
return getJobConf().getNumReduceTasks();
}
/**
 * Method getCurrentSliceNum returns the task number of the current task, or slice. Task 0 is the first task.
*
* @return int
*/
@Override
public int getCurrentSliceNum()
{
return getJobConf().getInt( "mapred.task.partition", 0 );
}
@Override
public int getNumProcessSlices()
{
if( isMapper() )
return getCurrentNumMappers();
else
return getCurrentNumReducers();
}
/**
* Method setReporter sets the reporter of this HadoopFlowProcess object.
*
* @param reporter the reporter of this HadoopFlowProcess object.
*/
public void setReporter( Reporter reporter )
{
this.reporter = reporter;
}
/**
* Method getReporter returns the reporter of this HadoopFlowProcess object.
*
* @return the reporter (type Reporter) of this HadoopFlowProcess object.
*/
public Reporter getReporter()
{
return reporter;
}
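
/**
 * Method setOutputCollector sets the outputCollector of this HadoopFlowProcess object.
 *
 * @param outputCollector the outputCollector of this HadoopFlowProcess object.
 */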
public void setOutputCollector( OutputCollector outputCollector )
{
this.outputCollector = outputCollector;
}
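
/**
 * Method getOutputCollector returns the outputCollector of this HadoopFlowProcess object.
 *
 * @return the outputCollector (type OutputCollector) of this HadoopFlowProcess object.
 */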
public OutputCollector getOutputCollector()
{
return outputCollector;
}
@Override
public Object getProperty( String key )
{
return jobConf.get( key );
}
@Override
public Collection<String> getPropertyKeys()
{
Set<String> keys = new HashSet<String>();
for( Map.Entry<String, String> entry : jobConf )
keys.add( entry.getKey() );
return Collections.unmodifiableSet( keys );
}
@Override
public Object newInstance( String className )
{
if( className == null || className.isEmpty() )
return null;
try
{
Class<?> type = HadoopFlowProcess.class.getClassLoader().loadClass( className );
return ReflectionUtils.newInstance( type, jobConf );
}
catch( ClassNotFoundException exception )
{
throw new CascadingException( "unable to load class: " + className, exception );
}
}
@Override
public void keepAlive()
{
getReporter().progress();
}
@Override
public void increment( Enum counter, long amount )
{
getReporter().incrCounter( counter, amount );
}
@Override
public void increment( String group, String counter, long amount )
{
getReporter().incrCounter( group, counter, amount );
}
@Override
public void setStatus( String status )
{
getReporter().setStatus( status );
}
@Override
public boolean isCounterStatusInitialized()
{
return getReporter() != null;
}
@Override
public TupleEntryIterator openTapForRead( Tap tap ) throws IOException
{
return tap.openForRead( this );
}
@Override
public TupleEntryCollector openTapForWrite( Tap tap ) throws IOException
{
return tap.openForWrite( this, null ); // do not honor sinkmode as this may be opened across tasks
}
@Override
public TupleEntryCollector openTrapForWrite( Tap trap ) throws IOException
{
JobConf jobConf = HadoopUtil.copyJobConf( getJobConf() );
int stepNum = jobConf.getInt( "cascading.flow.step.num", 0 );
String partname;
if( jobConf.getBoolean( "mapred.task.is.map", true ) )
partname = String.format( "-m-%05d-", stepNum );
else
partname = String.format( "-r-%05d-", stepNum );
jobConf.set( "cascading.tapcollector.partname", "%s%spart" + partname + "%05d" );
return trap.openForWrite( new HadoopFlowProcess( this, jobConf ), null ); // do not honor sinkmode as this may be opened across tasks
}
@Override
public TupleEntryCollector openSystemIntermediateForWrite() throws IOException
{
return new TupleEntryCollector( Fields.size( 2 ) )
{
@Override
protected void collect( TupleEntry tupleEntry )
{
try
{
getOutputCollector().collect( tupleEntry.getObject( 0 ), tupleEntry.getObject( 1 ) );
}
catch( IOException exception )
{
throw new CascadingException( "failed collecting key and value", exception );
}
}
};
}
@Override
public <C> C copyConfig( C config )
{
return HadoopUtil.copyJobConf( config );
}
@Override
public <C> Map<String, String> diffConfigIntoMap( C defaultConfig, C updatedConfig )
{
return HadoopUtil.getConfig( (Configuration) defaultConfig, (Configuration) updatedConfig );
}
@Override
public JobConf mergeMapIntoConfig( JobConf defaultConfig, Map<String, String> map )
{
return HadoopUtil.mergeConf( defaultConfig, map, false );
}
}