/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading.tap.hadoop;
import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.io.CombineFileRecordReaderWrapper;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeCollector;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeIterator;
import cascading.tap.type.FileType;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileRecordReader;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class Hfs is the base class for all Hadoop file system access. Hfs may only be used with the
* {@link cascading.flow.hadoop.HadoopFlowConnector} when creating Hadoop executable {@link cascading.flow.Flow}
* instances.
* <p/>
* Paths should typically point to a directory, in which case all the "part" files immediately within that
* directory will be included. This is the layout Hadoop expects. Sub-directories are not included and
* typically result in a failure.
* <p/>
* To include sub-directories, Hadoop supports "globbing". Globbing is a frustrating feature and is supported more
* robustly by {@link GlobHfs} and less so by Hfs.
* <p/>
* Hfs will accept {@code /*} (wildcard) paths, but not all convenience methods, like
* {@link #getSize(Configuration)}, will behave properly or reliably. Nor can an Hfs instance
* with a wildcard path be used as a sink to write data.
* <p/>
* In those cases use GlobHfs since it is a sub-class of {@link cascading.tap.MultiSourceTap}.
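* <p/>
* For illustration, a minimal sketch of a glob source (the scheme and pattern are hypothetical):
* <pre>{@code
* Tap logs = new GlobHfs( new cascading.scheme.hadoop.TextLine(), "hdfs://namenode:8020/logs/part-[0-9]*" );
* }</pre>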
* <p/>
* Optionally use {@link Dfs} or {@link Lfs} for resources specific to the Hadoop Distributed File System or
* the local file system, respectively. Using Hfs is the best practice when possible; Lfs and Dfs are conveniences.
* <p/>
* Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the
* 'stringPath', where <code>hdfs://...</code> denotes Dfs and <code>file://...</code> denotes Lfs.
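* <p/>
* For illustration, a minimal sketch of constructing taps against either filesystem (the scheme, fields, and
* paths are hypothetical):
* <pre>{@code
* Scheme scheme = new cascading.scheme.hadoop.TextLine( new Fields( "offset", "line" ) );
* Tap hdfsTap = new Hfs( scheme, "hdfs://namenode:8020/logs/input" ); // behaves as Dfs
* Tap localTap = new Hfs( scheme, "file:///tmp/logs/input" ); // behaves as Lfs
* }</pre>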
* <p/>
* Call {@link #setTemporaryDirectory(java.util.Map, String)} to use a temporary file directory path
* other than the current Hadoop default path.
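* <p/>
* For example, a sketch using this deprecated static helper (see {@link HfsProps} for the preferred approach;
* the path is illustrative):
* <pre>{@code
* Properties properties = new Properties();
* Hfs.setTemporaryDirectory( properties, "hdfs://namenode:8020/tmp/cascading" );
* }</pre>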
* <p/>
* By default, Cascading on Hadoop will assume any source or sink Tap using the {@code file://} URI scheme
* intends to read files from the local client filesystem (for example, when using the {@code Lfs} Tap) where the
* Hadoop job jar is started, and so will force any MapReduce jobs reading or writing to {@code file://} resources
* to run in Hadoop "standalone mode" so that the files can be read.
* <p/>
* To change this behavior, call {@link HfsProps#setLocalModeScheme(java.util.Map, String)} to set a different
* scheme value, or set the value to "none" to disable this behavior entirely, for the case where the file to be
* read is available on every Hadoop processing node at the exact same path.
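* <p/>
* For example, a sketch disabling the local mode behavior entirely:
* <pre>{@code
* Properties properties = new Properties();
* HfsProps.setLocalModeScheme( properties, "none" ); // files are expected at the same path on every node
* FlowConnector flowConnector = new HadoopFlowConnector( properties );
* }</pre>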
* <p/>
* Hfs can optionally combine multiple small files (or a series of small "blocks") into larger "splits". This reduces
* the number of resulting map tasks created by Hadoop and can improve application performance.
* <p/>
* This is enabled by calling {@link HfsProps#setUseCombinedInput(boolean)} with {@code true}. By default, merging
* or combining splits into larger ones is disabled.
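* <p/>
* For example, a sketch assuming the fluent {@code HfsProps.hfsProps()} builder is available:
* <pre>{@code
* Properties properties = HfsProps.hfsProps().setUseCombinedInput( true ).buildProperties();
* FlowConnector flowConnector = new HadoopFlowConnector( properties );
* }</pre>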
*/
public class Hfs extends Tap<Configuration, RecordReader, OutputCollector> implements FileType<Configuration>
{
/** Field LOG */
private static final Logger LOG = LoggerFactory.getLogger( Hfs.class );
/**
* Field TEMPORARY_DIRECTORY
*
* @deprecated see {@link HfsProps#TEMPORARY_DIRECTORY}
*/
@Deprecated
public static final String TEMPORARY_DIRECTORY = HfsProps.TEMPORARY_DIRECTORY;
/** Field stringPath */
protected String stringPath;
/** Field uriScheme */
transient URI uriScheme;
/** Field path */
transient Path path;
/** Field statuses */
private transient FileStatus[] statuses; // only used by getModifiedTime
private transient String cachedPath = null;
/**
* Method setTemporaryDirectory sets the temporary directory on the given properties object.
*
* @param properties of type Map<Object,Object>
* @param tempDir of type String
* @deprecated see {@link HfsProps}
*/
@Deprecated
public static void setTemporaryDirectory( Map<Object, Object> properties, String tempDir )
{
properties.put( HfsProps.TEMPORARY_DIRECTORY, tempDir );
}
/**
* Method getTemporaryDirectory returns the configured temporary directory from the given properties object.
*
* @param properties of type Map<Object,Object>
* @return a String or null if not set
* @deprecated see {@link HfsProps}
*/
@Deprecated
public static String getTemporaryDirectory( Map<Object, Object> properties )
{
return (String) properties.get( HfsProps.TEMPORARY_DIRECTORY );
}
protected static String getLocalModeScheme( Configuration conf, String defaultValue )
{
return conf.get( HfsProps.LOCAL_MODE_SCHEME, defaultValue );
}
protected static boolean getUseCombinedInput( Configuration conf )
{
return conf.getBoolean( HfsProps.COMBINE_INPUT_FILES, false );
}
protected static boolean getCombinedInputSafeMode( Configuration conf )
{
return conf.getBoolean( HfsProps.COMBINE_INPUT_FILES_SAFE_MODE, true );
}
protected Hfs()
{
}
@ConstructorProperties({"scheme"})
protected Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme )
{
super( scheme );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
*/
@ConstructorProperties({"scheme", "stringPath"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath )
{
super( scheme );
setStringPath( stringPath );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
* @param replace of type boolean
*/
@Deprecated
@ConstructorProperties({"scheme", "stringPath", "replace"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath, boolean replace )
{
super( scheme, replace ? SinkMode.REPLACE : SinkMode.KEEP );
setStringPath( stringPath );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
* @param sinkMode of type SinkMode
*/
@ConstructorProperties({"scheme", "stringPath", "sinkMode"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath, SinkMode sinkMode )
{
super( scheme, sinkMode );
setStringPath( stringPath );
}
protected void setStringPath( String stringPath )
{
this.stringPath = Util.normalizeUrl( stringPath );
}
protected void setUriScheme( URI uriScheme )
{
this.uriScheme = uriScheme;
}
public URI getURIScheme( Configuration jobConf )
{
if( uriScheme != null )
return uriScheme;
uriScheme = makeURIScheme( jobConf );
return uriScheme;
}
protected URI makeURIScheme( Configuration configuration )
{
try
{
URI uriScheme;
LOG.debug( "handling path: {}", stringPath );
URI uri = new Path( stringPath ).toUri(); // safer URI parsing
String schemeString = uri.getScheme();
String authority = uri.getAuthority();
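// e.g. "hdfs://namenode:8020/logs/input" yields scheme "hdfs" and authority "namenode:8020"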
LOG.debug( "found scheme: {}, authority: {}", schemeString, authority );
if( schemeString != null && authority != null )
uriScheme = new URI( schemeString + "://" + uri.getAuthority() );
else if( schemeString != null )
uriScheme = new URI( schemeString + ":///" );
else
uriScheme = getDefaultFileSystemURIScheme( configuration );
LOG.debug( "using uri scheme: {}", uriScheme );
return uriScheme;
}
catch( URISyntaxException exception )
{
throw new TapException( "could not determine scheme from path: " + getPath(), exception );
}
}
/**
* Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
*
* @param configuration of type Configuration
* @return URI
*/
public URI getDefaultFileSystemURIScheme( Configuration configuration )
{
return getDefaultFileSystem( configuration ).getUri();
}
protected FileSystem getDefaultFileSystem( Configuration configuration )
{
try
{
return FileSystem.get( configuration );
}
catch( IOException exception )
{
throw new TapException( "unable to get handle to underlying filesystem", exception );
}
}
protected FileSystem getFileSystem( Configuration configuration )
{
URI scheme = getURIScheme( configuration );
try
{
return FileSystem.get( scheme, configuration );
}
catch( IOException exception )
{
throw new TapException( "unable to get handle to get filesystem for: " + scheme.getScheme(), exception );
}
}
@Override
public String getIdentifier()
{
if( cachedPath == null )
cachedPath = getPath().toString();
return cachedPath;
}
public Path getPath()
{
if( path != null )
return path;
if( stringPath == null )
throw new IllegalStateException( "path not initialized" );
path = new Path( stringPath );
return path;
}
@Override
public String getFullIdentifier( Configuration conf )
{
return getPath().makeQualified( getFileSystem( conf ) ).toString();
}
@Override
public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf )
{
String fullIdentifier = getFullIdentifier( conf );
applySourceConfInitIdentifiers( process, conf, fullIdentifier );
verifyNoDuplicates( conf );
}
protected static void verifyNoDuplicates( Configuration conf )
{
Path[] inputPaths = FileInputFormat.getInputPaths( HadoopUtil.asJobConfInstance( conf ) );
Set<Path> paths = new HashSet<Path>( (int) ( inputPaths.length / .75f ) );
for( Path inputPath : inputPaths )
{
if( !paths.add( inputPath ) )
throw new TapException( "may not add duplicate paths, found: " + inputPath );
}
}
protected void applySourceConfInitIdentifiers( FlowProcess<? extends Configuration> process, Configuration conf, String... fullIdentifiers )
{
for( String fullIdentifier : fullIdentifiers )
sourceConfInitAddInputPath( conf, new Path( fullIdentifier ) );
sourceConfInitComplete( process, conf );
}
protected void sourceConfInitAddInputPath( Configuration conf, Path qualifiedPath )
{
HadoopUtil.addInputPath( conf, qualifiedPath );
makeLocal( conf, qualifiedPath, "forcing job to local mode, via source: " );
}
protected void sourceConfInitComplete( FlowProcess<? extends Configuration> process, Configuration conf )
{
super.sourceConfInit( process, conf );
TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow
// use CombineFileInputFormat if that is enabled
handleCombineFileInputFormat( conf );
}
/**
* Based on the configuration, handles and sets {@link CombineFileInputFormat} as the input
* format.
*/
private void handleCombineFileInputFormat( Configuration conf )
{
// if combining files, override the configuration to use CombineFileInputFormat
if( !getUseCombinedInput( conf ) )
return;
// get the prescribed individual input format from the underlying scheme so it can be used by CombinedInputFormat
String individualInputFormat = conf.get( "mapred.input.format.class" );
if( individualInputFormat == null )
throw new TapException( "input format is missing from the underlying scheme" );
if( individualInputFormat.equals( CombinedInputFormat.class.getName() ) &&
conf.get( CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT ) == null )
throw new TapException( "the input format class is already the combined input format but the underlying input format is missing" );
// if safe mode is on (default) throw an exception if the InputFormat is not a FileInputFormat, otherwise log a
// warning and don't use the CombineFileInputFormat
boolean safeMode = getCombinedInputSafeMode( conf );
if( !FileInputFormat.class.isAssignableFrom( conf.getClass( "mapred.input.format.class", null ) ) )
{
if( safeMode )
throw new TapException( "input format must be of type org.apache.hadoop.mapred.FileInputFormat, got: " + individualInputFormat );
else
LOG.warn( "not combining input splits with CombineFileInputFormat, {} is not of type org.apache.hadoop.mapred.FileInputFormat.", individualInputFormat );
}
else
{
// set the underlying individual input format
conf.set( CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT, individualInputFormat );
// override the input format class
conf.setClass( "mapred.input.format.class", CombinedInputFormat.class, InputFormat.class );
}
}
@Override
public void sinkConfInit( FlowProcess<? extends Configuration> process, Configuration conf )
{
Path qualifiedPath = new Path( getFullIdentifier( conf ) );
HadoopUtil.setOutputPath( conf, qualifiedPath );
super.sinkConfInit( process, conf );
makeLocal( conf, qualifiedPath, "forcing job to local mode, via sink: " );
TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow
}
private void makeLocal( Configuration conf, Path qualifiedPath, String infoMessage )
{
String scheme = getLocalModeScheme( conf, "file" );
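// note: a scheme value of "none" matches no real URI scheme, effectively disabling the local mode forcing below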
if( !HadoopUtil.isLocal( conf ) && qualifiedPath.toUri().getScheme().equalsIgnoreCase( scheme ) )
{
if( LOG.isInfoEnabled() )
LOG.info( infoMessage + toString() );
HadoopUtil.setLocal( conf ); // force job to run locally
}
}
@Override
public TupleEntryIterator openForRead( FlowProcess<? extends Configuration> flowProcess, RecordReader input ) throws IOException
{
// input may be null when this method is called on the client side or cluster side when accumulating
// for a HashJoin
return new HadoopTupleEntrySchemeIterator( flowProcess, this, input );
}
@Override
public TupleEntryCollector openForWrite( FlowProcess<? extends Configuration> flowProcess, OutputCollector output ) throws IOException
{
// output may be null when this method is called on the client side or cluster side when creating
// side files with the TemplateTap
return new HadoopTupleEntrySchemeCollector( flowProcess, this, output );
}
@Override
public boolean createResource( Configuration conf ) throws IOException
{
if( LOG.isDebugEnabled() )
LOG.debug( "making dirs: {}", getFullIdentifier( conf ) );
return getFileSystem( conf ).mkdirs( getPath() );
}
@Override
public boolean deleteResource( Configuration conf ) throws IOException
{
String fullIdentifier = getFullIdentifier( conf );
return deleteFullIdentifier( conf, fullIdentifier );
}
private boolean deleteFullIdentifier( Configuration conf, String fullIdentifier ) throws IOException
{
if( LOG.isDebugEnabled() )
LOG.debug( "deleting: {}", fullIdentifier );
Path fullPath = new Path( fullIdentifier );
// do not delete the root directory
if( fullPath.depth() == 0 )
return true;
FileSystem fileSystem = getFileSystem( conf );
try
{
return fileSystem.delete( fullPath, true );
}
catch( NullPointerException exception )
{
// hack to get around npe thrown when fs reaches root directory
if( !( fileSystem instanceof NativeS3FileSystem ) )
throw exception;
}
return true;
}
public boolean deleteChildResource( Configuration conf, String childIdentifier ) throws IOException
{
Path childPath = new Path( childIdentifier ).makeQualified( getFileSystem( conf ) );
if( !childPath.toString().startsWith( getFullIdentifier( conf ) ) )
return false;
return deleteFullIdentifier( conf, childPath.toString() );
}
@Override
public boolean resourceExists( Configuration conf ) throws IOException
{
// unfortunately getFileSystem( conf ).exists( getPath() ) does not account for "/*" etc
// nor is there a more efficient means to test for existence
FileStatus[] fileStatuses = getFileSystem( conf ).globStatus( getPath() );
return fileStatuses != null && fileStatuses.length > 0;
}
@Override
public boolean isDirectory( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
{
return false;
}
return getFileSystem( conf ).getFileStatus( getPath() ).isDir();
}
@Override
public long getSize( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getLen();
}
/**
* Method getBlockSize returns the {@code blocksize} specified by the underlying file system for this resource.
*
* @param conf of type Configuration
* @return long
* @throws IOException when the file status cannot be read
*/
public long getBlockSize( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getBlockSize();
}
/**
* Method getReplication returns the {@code replication} specified by the underlying file system for
* this resource.
*
* @param conf of type Configuration
* @return int
* @throws IOException when the file status cannot be read
*/
public int getReplication( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getReplication();
}
@Override
public String[] getChildIdentifiers( Configuration conf ) throws IOException
{
return getChildIdentifiers( conf, 1, false );
}
@Override
public String[] getChildIdentifiers( Configuration conf, int depth, boolean fullyQualified ) throws IOException
{
if( !resourceExists( conf ) )
return new String[ 0 ];
if( depth == 0 && !fullyQualified )
return new String[]{getIdentifier()};
String fullIdentifier = getFullIdentifier( conf );
int trim = fullyQualified ? 0 : fullIdentifier.length() + 1;
Set<String> results = new LinkedHashSet<String>();
getChildPaths( conf, results, trim, new Path( fullIdentifier ), depth );
return results.toArray( new String[ results.size() ] );
}
private void getChildPaths( Configuration conf, Set<String> results, int trim, Path path, int depth ) throws IOException
{
if( depth == 0 )
{
String substring = path.toString().substring( trim );
String identifier = getIdentifier();
if( identifier == null || identifier.isEmpty() )
results.add( new Path( substring ).toString() );
else
results.add( new Path( identifier, substring ).toString() );
return;
}
FileStatus[] statuses = getFileSystem( conf ).listStatus( path, new Utils.OutputFileUtils.OutputFilesFilter() );
if( statuses == null )
return;
for( FileStatus fileStatus : statuses )
getChildPaths( conf, results, trim, fileStatus.getPath(), depth - 1 );
}
@Override
public long getModifiedTime( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( !fileStatus.isDir() )
return fileStatus.getModificationTime();
// todo: this should ignore the _temporary path, or not cache if found in the array
makeStatuses( conf );
// statuses is empty, return 0
if( statuses == null || statuses.length == 0 )
return 0;
long date = 0;
// filter out directories since we don't recurse into sub-directories
for( FileStatus status : statuses )
{
if( !status.isDir() )
date = Math.max( date, status.getModificationTime() );
}
return date;
}
public static Path getTempPath( Configuration conf )
{
String tempDir = conf.get( HfsProps.TEMPORARY_DIRECTORY );
if( tempDir == null )
tempDir = conf.get( "hadoop.tmp.dir" );
return new Path( tempDir );
}
protected String makeTemporaryPathDirString( String name )
{
// a leading underscore marks a hidden file, so strip leading underscores and other non-word characters
name = name.replaceAll( "^[_\\W\\s]+", "" );
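// e.g. "_temp output!" ultimately becomes "temp_output_" plus a unique ID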
if( name.isEmpty() )
name = "temp-path";
return name.replaceAll( "[\\W\\s]+", "_" ) + Util.createUniqueID();
}
/**
* Populates the cached array of {@link FileStatus} objects for this path, if not already populated.
*
* @param conf of type Configuration
* @throws IOException on failure
*/
private void makeStatuses( Configuration conf ) throws IOException
{
if( statuses != null )
return;
statuses = getFileSystem( conf ).listStatus( getPath() );
}
/** Combined input format that uses the underlying individual input format to combine multiple files into a single split. */
static class CombinedInputFormat extends CombineFileInputFormat implements Configurable
{
private Configuration conf;
public RecordReader getRecordReader( InputSplit split, JobConf job, Reporter reporter ) throws IOException
{
return new CombineFileRecordReader( job, (CombineFileSplit) split, reporter, CombineFileRecordReaderWrapper.class );
}
@Override
public void setConf( Configuration conf )
{
this.conf = conf;
// set the aliased property value; if zero, the super class will look up the hadoop property
setMaxSplitSize( conf.getLong( HfsProps.COMBINE_INPUT_FILES_SIZE_MAX, 0 ) );
}
@Override
public Configuration getConf()
{
return conf;
}
}
}