/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading.tap.hadoop;
import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.io.CombineFileRecordReaderWrapper;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeCollector;
import cascading.tap.hadoop.io.HadoopTupleEntrySchemeIterator;
import cascading.tap.type.FileType;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;
import org.apache.hadoop.mapred.lib.CombineFileRecordReader;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class Hfs is the base class for all Hadoop file system access. Hfs may only be used with the
* {@link cascading.flow.hadoop.HadoopFlowConnector} when creating Hadoop executable {@link cascading.flow.Flow}
* instances.
* <p/>
* Paths should typically point to a directory, in which case all the "part" files immediately within that
* directory will be included. This is the layout Hadoop expects. Sub-directories are not included and
* typically result in a failure.
* <p/>
* To include sub-directories, Hadoop supports "globbing". Globbing is a frustrating feature and is supported more
* robustly by {@link GlobHfs} and less so by Hfs.
* <p/>
* Hfs will accept {@code /*} (wildcard) paths, but not all convenience methods, like
* {@link #getSize(Configuration)}, will behave properly or reliably. Nor can an Hfs instance
* with a wildcard path be used as a sink to write data.
* <p/>
* In those cases use GlobHfs since it is a sub-class of {@link cascading.tap.MultiSourceTap}.
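* <p/>
* For illustration, a minimal sketch of a glob source (the scheme and pattern are hypothetical):
* <pre>{@code
* Tap logs = new GlobHfs( new cascading.scheme.hadoop.TextLine(), "hdfs://namenode:8020/logs/part-[0-9]*" );
* }</pre>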
* <p/>
* Optionally use {@link Dfs} or {@link Lfs} for resources specific to the Hadoop Distributed File System or
* the local file system, respectively. Using Hfs is the best practice when possible; Lfs and Dfs are conveniences.
* <p/>
* Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the
* 'stringPath', where <code>hdfs://...</code> denotes Dfs and <code>file://...</code> denotes Lfs.
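* <p/>
* For illustration, a minimal sketch of constructing taps against either filesystem (the scheme, fields, and
* paths are hypothetical):
* <pre>{@code
* Scheme scheme = new cascading.scheme.hadoop.TextLine( new Fields( "offset", "line" ) );
* Tap hdfsTap = new Hfs( scheme, "hdfs://namenode:8020/logs/input" ); // behaves as Dfs
* Tap localTap = new Hfs( scheme, "file:///tmp/logs/input" ); // behaves as Lfs
* }</pre>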
* <p/>
* Call {@link #setTemporaryDirectory(java.util.Map, String)} to use a temporary file directory path
* other than the current Hadoop default path.
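* <p/>
* For example, a sketch using this deprecated static helper (see {@link HfsProps} for the preferred approach;
* the path is illustrative):
* <pre>{@code
* Properties properties = new Properties();
* Hfs.setTemporaryDirectory( properties, "hdfs://namenode:8020/tmp/cascading" );
* }</pre>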
* <p/>
* By default, Cascading on Hadoop will assume any source or sink Tap using the {@code file://} URI scheme
* intends to read files from the local client filesystem (for example, when using the {@code Lfs} Tap) where the
* Hadoop job jar is started, and so will force any MapReduce jobs reading or writing to {@code file://} resources
* to run in Hadoop "standalone mode" so that the files can be read.
* <p/>
* To change this behavior, call {@link HfsProps#setLocalModeScheme(java.util.Map, String)} to set a different
* scheme value, or set the value to "none" to disable this behavior entirely, for the case where the file to be
* read is available on every Hadoop processing node at the exact same path.
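* <p/>
* For example, a sketch disabling the local mode behavior entirely:
* <pre>{@code
* Properties properties = new Properties();
* HfsProps.setLocalModeScheme( properties, "none" ); // files are expected at the same path on every node
* FlowConnector flowConnector = new HadoopFlowConnector( properties );
* }</pre>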
* <p/>
* Hfs can optionally combine multiple small files (or a series of small "blocks") into larger "splits". This reduces
* the number of resulting map tasks created by Hadoop and can improve application performance.
* <p/>
* This is enabled by calling {@link HfsProps#setUseCombinedInput(boolean)} with {@code true}. By default, merging
* or combining splits into larger ones is disabled.
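* <p/>
* For example, a sketch assuming the fluent {@code HfsProps.hfsProps()} builder is available:
* <pre>{@code
* Properties properties = HfsProps.hfsProps().setUseCombinedInput( true ).buildProperties();
* FlowConnector flowConnector = new HadoopFlowConnector( properties );
* }</pre>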
*/
public class Hfs extends Tap<Configuration, RecordReader, OutputCollector> implements FileType<Configuration>
{
/** Field LOG */
private static final Logger LOG = LoggerFactory.getLogger( Hfs.class );
/**
* Field TEMPORARY_DIRECTORY
*
* @deprecated see {@link HfsProps#TEMPORARY_DIRECTORY}
*/
@Deprecated
public static final String TEMPORARY_DIRECTORY = HfsProps.TEMPORARY_DIRECTORY;
/** Field stringPath */
protected String stringPath;
/** Field uriScheme */
transient URI uriScheme;
/** Field path */
transient Path path;
/** Field statuses */
private transient FileStatus[] statuses; // only used by getModifiedTime
private transient String cachedPath = null;
/**
* Method setTemporaryDirectory sets the temporary directory on the given properties object.
*
* @param properties of type Map<Object,Object>
* @param tempDir of type String
* @deprecated see {@link HfsProps}
*/
@Deprecated
public static void setTemporaryDirectory( Map<Object, Object> properties, String tempDir )
{
properties.put( HfsProps.TEMPORARY_DIRECTORY, tempDir );
}
/**
* Method getTemporaryDirectory returns the configured temporary directory from the given properties object.
*
* @param properties of type Map<Object,Object>
* @return a String or null if not set
* @deprecated see {@link HfsProps}
*/
@Deprecated
public static String getTemporaryDirectory( Map<Object, Object> properties )
{
return (String) properties.get( HfsProps.TEMPORARY_DIRECTORY );
}
protected static String getLocalModeScheme( Configuration conf, String defaultValue )
{
return conf.get( HfsProps.LOCAL_MODE_SCHEME, defaultValue );
}
protected static boolean getUseCombinedInput( Configuration conf )
{
return conf.getBoolean( HfsProps.COMBINE_INPUT_FILES, false );
}
protected static boolean getCombinedInputSafeMode( Configuration conf )
{
return conf.getBoolean( HfsProps.COMBINE_INPUT_FILES_SAFE_MODE, true );
}
protected Hfs()
{
}
@ConstructorProperties({"scheme"})
protected Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme )
{
super( scheme );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
*/
@ConstructorProperties({"scheme", "stringPath"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath )
{
super( scheme );
setStringPath( stringPath );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
* @param replace of type boolean
*/
@Deprecated
@ConstructorProperties({"scheme", "stringPath", "replace"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath, boolean replace )
{
super( scheme, replace ? SinkMode.REPLACE : SinkMode.KEEP );
setStringPath( stringPath );
}
/**
* Constructor Hfs creates a new Hfs instance.
*
* @param scheme of type Scheme
* @param stringPath of type String
* @param sinkMode of type SinkMode
*/
@ConstructorProperties({"scheme", "stringPath", "sinkMode"})
public Hfs( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme, String stringPath, SinkMode sinkMode )
{
super( scheme, sinkMode );
setStringPath( stringPath );
}
protected void setStringPath( String stringPath )
{
this.stringPath = Util.normalizeUrl( stringPath );
}
protected void setUriScheme( URI uriScheme )
{
this.uriScheme = uriScheme;
}
public URI getURIScheme( Configuration jobConf )
{
if( uriScheme != null )
return uriScheme;
uriScheme = makeURIScheme( jobConf );
return uriScheme;
}
protected URI makeURIScheme( Configuration configuration )
{
try
{
URI uriScheme;
LOG.debug( "handling path: {}", stringPath );
URI uri = new Path( stringPath ).toUri(); // safer URI parsing
String schemeString = uri.getScheme();
String authority = uri.getAuthority();
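// e.g. "hdfs://namenode:8020/logs/input" yields scheme "hdfs" and authority "namenode:8020"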
LOG.debug( "found scheme: {}, authority: {}", schemeString, authority );
if( schemeString != null && authority != null )
uriScheme = new URI( schemeString + "://" + uri.getAuthority() );
else if( schemeString != null )
uriScheme = new URI( schemeString + ":///" );
else
uriScheme = getDefaultFileSystemURIScheme( configuration );
LOG.debug( "using uri scheme: {}", uriScheme );
return uriScheme;
}
catch( URISyntaxException exception )
{
throw new TapException( "could not determine scheme from path: " + getPath(), exception );
}
}
/**
* Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
*
* @param configuration of type Configuration
* @return URI
*/
public URI getDefaultFileSystemURIScheme( Configuration configuration )
{
return getDefaultFileSystem( configuration ).getUri();
}
protected FileSystem getDefaultFileSystem( Configuration configuration )
{
try
{
return FileSystem.get( configuration );
}
catch( IOException exception )
{
throw new TapException( "unable to get handle to underlying filesystem", exception );
}
}
protected FileSystem getFileSystem( Configuration configuration )
{
URI scheme = getURIScheme( configuration );
try
{
return FileSystem.get( scheme, configuration );
}
catch( IOException exception )
{
throw new TapException( "unable to get handle to get filesystem for: " + scheme.getScheme(), exception );
}
}
@Override
public String getIdentifier()
{
if( cachedPath == null )
cachedPath = getPath().toString();
return cachedPath;
}
public Path getPath()
{
if( path != null )
return path;
if( stringPath == null )
throw new IllegalStateException( "path not initialized" );
path = new Path( stringPath );
return path;
}
@Override
public String getFullIdentifier( Configuration conf )
{
return getPath().makeQualified( getFileSystem( conf ) ).toString();
}
@Override
public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf )
{
String fullIdentifier = getFullIdentifier( conf );
applySourceConfInitIdentifiers( process, conf, fullIdentifier );
verifyNoDuplicates( conf );
}
protected static void verifyNoDuplicates( Configuration conf )
{
Path[] inputPaths = FileInputFormat.getInputPaths( HadoopUtil.asJobConfInstance( conf ) );
Set<Path> paths = new HashSet<Path>( (int) ( inputPaths.length / .75f ) );
for( Path inputPath : inputPaths )
{
if( !paths.add( inputPath ) )
throw new TapException( "may not add duplicate paths, found: " + inputPath );
}
}
protected void applySourceConfInitIdentifiers( FlowProcess<? extends Configuration> process, Configuration conf, String... fullIdentifiers )
{
for( String fullIdentifier : fullIdentifiers )
sourceConfInitAddInputPath( conf, new Path( fullIdentifier ) );
sourceConfInitComplete( process, conf );
}
protected void sourceConfInitAddInputPath( Configuration conf, Path qualifiedPath )
{
HadoopUtil.addInputPath( conf, qualifiedPath );
makeLocal( conf, qualifiedPath, "forcing job to local mode, via source: " );
}
protected void sourceConfInitComplete( FlowProcess<? extends Configuration> process, Configuration conf )
{
super.sourceConfInit( process, conf );
TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow
// use CombineFileInputFormat if that is enabled
handleCombineFileInputFormat( conf );
}
/**
* Based on the configuration, handles and sets {@link CombineFileInputFormat} as the input
* format.
*/
private void handleCombineFileInputFormat( Configuration conf )
{
// if combining files, override the configuration to use CombineFileInputFormat
if( !getUseCombinedInput( conf ) )
return;
// get the prescribed individual input format from the underlying scheme so it can be used by CombinedInputFormat
String individualInputFormat = conf.get( "mapred.input.format.class" );
if( individualInputFormat == null )
throw new TapException( "input format is missing from the underlying scheme" );
if( individualInputFormat.equals( CombinedInputFormat.class.getName() ) &&
conf.get( CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT ) == null )
throw new TapException( "the input format class is already the combined input format but the underlying input format is missing" );
// if safe mode is on (default) throw an exception if the InputFormat is not a FileInputFormat, otherwise log a
// warning and don't use the CombineFileInputFormat
boolean safeMode = getCombinedInputSafeMode( conf );
if( !FileInputFormat.class.isAssignableFrom( conf.getClass( "mapred.input.format.class", null ) ) )
{
if( safeMode )
throw new TapException( "input format must be of type org.apache.hadoop.mapred.FileInputFormat, got: " + individualInputFormat );
else
LOG.warn( "not combining input splits with CombineFileInputFormat, {} is not of type org.apache.hadoop.mapred.FileInputFormat.", individualInputFormat );
}
else
{
// set the underlying individual input format
conf.set( CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT, individualInputFormat );
// override the input format class
conf.setClass( "mapred.input.format.class", CombinedInputFormat.class, InputFormat.class );
}
}
@Override
public void sinkConfInit( FlowProcess<? extends Configuration> process, Configuration conf )
{
Path qualifiedPath = new Path( getFullIdentifier( conf ) );
HadoopUtil.setOutputPath( conf, qualifiedPath );
super.sinkConfInit( process, conf );
makeLocal( conf, qualifiedPath, "forcing job to local mode, via sink: " );
TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow
}
private void makeLocal( Configuration conf, Path qualifiedPath, String infoMessage )
{
String scheme = getLocalModeScheme( conf, "file" );
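// note: a scheme value of "none" matches no real URI scheme, effectively disabling the local mode forcing below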
if( !HadoopUtil.isLocal( conf ) && qualifiedPath.toUri().getScheme().equalsIgnoreCase( scheme ) )
{
if( LOG.isInfoEnabled() )
LOG.info( infoMessage + toString() );
HadoopUtil.setLocal( conf ); // force job to run locally
}
}
@Override
public TupleEntryIterator openForRead( FlowProcess<? extends Configuration> flowProcess, RecordReader input ) throws IOException
{
// input may be null when this method is called on the client side or cluster side when accumulating
// for a HashJoin
return new HadoopTupleEntrySchemeIterator( flowProcess, this, input );
}
@Override
public TupleEntryCollector openForWrite( FlowProcess<? extends Configuration> flowProcess, OutputCollector output ) throws IOException
{
// output may be null when this method is called on the client side or cluster side when creating
// side files with the TemplateTap
return new HadoopTupleEntrySchemeCollector( flowProcess, this, output );
}
@Override
public boolean createResource( Configuration conf ) throws IOException
{
if( LOG.isDebugEnabled() )
LOG.debug( "making dirs: {}", getFullIdentifier( conf ) );
return getFileSystem( conf ).mkdirs( getPath() );
}
@Override
public boolean deleteResource( Configuration conf ) throws IOException
{
String fullIdentifier = getFullIdentifier( conf );
return deleteFullIdentifier( conf, fullIdentifier );
}
private boolean deleteFullIdentifier( Configuration conf, String fullIdentifier ) throws IOException
{
if( LOG.isDebugEnabled() )
LOG.debug( "deleting: {}", fullIdentifier );
Path fullPath = new Path( fullIdentifier );
// do not delete the root directory
if( fullPath.depth() == 0 )
return true;
FileSystem fileSystem = getFileSystem( conf );
try
{
return fileSystem.delete( fullPath, true );
}
catch( NullPointerException exception )
{
// hack to get around npe thrown when fs reaches root directory
if( !( fileSystem instanceof NativeS3FileSystem ) )
throw exception;
}
return true;
}
public boolean deleteChildResource( Configuration conf, String childIdentifier ) throws IOException
{
Path childPath = new Path( childIdentifier ).makeQualified( getFileSystem( conf ) );
if( !childPath.toString().startsWith( getFullIdentifier( conf ) ) )
return false;
return deleteFullIdentifier( conf, childPath.toString() );
}
@Override
public boolean resourceExists( Configuration conf ) throws IOException
{
// unfortunately getFileSystem( conf ).exists( getPath() ) does not account for "/*" etc
// nor is there a more efficient means to test for existence
FileStatus[] fileStatuses = getFileSystem( conf ).globStatus( getPath() );
return fileStatuses != null && fileStatuses.length > 0;
}
@Override
public boolean isDirectory( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
{
return false;
}
return getFileSystem( conf ).getFileStatus( getPath() ).isDir();
}
@Override
public long getSize( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getLen();
}
/**
* Method getBlockSize returns the {@code blocksize} specified by the underlying file system for this resource.
*
* @param conf of type Configuration
* @return long
* @throws IOException when the file status cannot be read
*/
public long getBlockSize( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getBlockSize();
}
/**
* Method getReplication returns the {@code replication} specified by the underlying file system for
* this resource.
*
* @param conf of type Configuration
* @return int
* @throws IOException when the file status cannot be read
*/
public int getReplication( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( fileStatus.isDir() )
return 0;
return fileStatus.getReplication();
}
@Override
public String[] getChildIdentifiers( Configuration conf ) throws IOException
{
return getChildIdentifiers( conf, 1, false );
}
@Override
public String[] getChildIdentifiers( Configuration conf, int depth, boolean fullyQualified ) throws IOException
{
if( !resourceExists( conf ) )
return new String[ 0 ];
if( depth == 0 && !fullyQualified )
return new String[]{getIdentifier()};
String fullIdentifier = getFullIdentifier( conf );
int trim = fullyQualified ? 0 : fullIdentifier.length() + 1;
Set<String> results = new LinkedHashSet<String>();
getChildPaths( conf, results, trim, new Path( fullIdentifier ), depth );
return results.toArray( new String[ results.size() ] );
}
private void getChildPaths( Configuration conf, Set<String> results, int trim, Path path, int depth ) throws IOException
{
if( depth == 0 )
{
String substring = path.toString().substring( trim );
String identifier = getIdentifier();
if( identifier == null || identifier.isEmpty() )
results.add( new Path( substring ).toString() );
else
results.add( new Path( identifier, substring ).toString() );
return;
}
FileStatus[] statuses = getFileSystem( conf ).listStatus( path, new Utils.OutputFileUtils.OutputFilesFilter() );
if( statuses == null )
return;
for( FileStatus fileStatus : statuses )
getChildPaths( conf, results, trim, fileStatus.getPath(), depth - 1 );
}
@Override
public long getModifiedTime( Configuration conf ) throws IOException
{
if( !resourceExists( conf ) )
return 0;
FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() );
if( !fileStatus.isDir() )
return fileStatus.getModificationTime();
// todo: this should ignore the _temporary path, or not cache if found in the array
makeStatuses( conf );
// statuses is empty, return 0
if( statuses == null || statuses.length == 0 )
return 0;
long date = 0;
// filter out directories since we don't recurse into sub-directories
for( FileStatus status : statuses )
{
if( !status.isDir() )
date = Math.max( date, status.getModificationTime() );
}
return date;
}
public static Path getTempPath( Configuration conf )
{
String tempDir = conf.get( HfsProps.TEMPORARY_DIRECTORY );
if( tempDir == null )
tempDir = conf.get( "hadoop.tmp.dir" );
return new Path( tempDir );
}
protected String makeTemporaryPathDirString( String name )
{
// a leading underscore marks a hidden file, so strip leading underscores and other non-word characters
name = name.replaceAll( "^[_\\W\\s]+", "" );
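// e.g. "_temp output!" ultimately becomes "temp_output_" plus a unique ID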
if( name.isEmpty() )
name = "temp-path";
return name.replaceAll( "[\\W\\s]+", "_" ) + Util.createUniqueID();
}
/**
* Populates the cached array of {@link FileStatus} objects for this path, if not already populated.
*
* @param conf of type Configuration
* @throws IOException on failure
*/
private void makeStatuses( Configuration conf ) throws IOException
{
if( statuses != null )
return;
statuses = getFileSystem( conf ).listStatus( getPath() );
}
/** Combined input format that uses the underlying individual input format to combine multiple files into a single split. */
static class CombinedInputFormat extends CombineFileInputFormat implements Configurable
{
private Configuration conf;
public RecordReader getRecordReader( InputSplit split, JobConf job, Reporter reporter ) throws IOException
{
return new CombineFileRecordReader( job, (CombineFileSplit) split, reporter, CombineFileRecordReaderWrapper.class );
}
@Override
public void setConf( Configuration conf )
{
this.conf = conf;
// set the aliased property value; if zero, the super class will look up the hadoop property
setMaxSplitSize( conf.getLong( HfsProps.COMBINE_INPUT_FILES_SIZE_MAX, 0 ) );
}
@Override
public Configuration getConf()
{
return conf;
}
}
}