Source Code of cascading.scheme.hadoop.TextLine

/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package cascading.scheme.hadoop;


import java.beans.ConstructorProperties;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;


import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.management.annotation.Property;
import cascading.management.annotation.PropertyDescription;
import cascading.management.annotation.Visibility;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;


import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;


/**
 * A TextLine is a type of {@link cascading.scheme.Scheme} for plain text files. Files are broken into
 * lines. Either a line-feed or a carriage-return is used to signal the end of a line.
 * <p/>
 * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line".
 * <p/>
 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
 * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
 * Any available field names can be given if only a subset of the incoming fields should be used.
 * <p/>
 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
 * will simply be the "line" value using the given field name.
 * <p/>
 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
 * writing out the line.
 * <p/>
 * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to the constructor
 * for the compression value, it will remain disabled.
 * <p/>
 * If any of the input files end with ".zip", an error will be thrown.
 * <p/>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 */
public class TextLine extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Object[]>
  {
  public enum Compress
    {
      DEFAULT, ENABLE, DISABLE
    }


  /** Field DEFAULT_CHARSET is the charset name used when no charsetName is given to a constructor */
  public static final String DEFAULT_CHARSET = "UTF-8";


  /** Field serialVersionUID */
  private static final long serialVersionUID = 1L;
  /** Field DEFAULT_SOURCE_FIELDS holds the "offset" and "line" fields used by constructors taking no source fields */
  public static final Fields DEFAULT_SOURCE_FIELDS = new Fields( "offset", "line" );


  /** Field sinkCompression starts out as DISABLE; setSinkCompression ignores null, so the setter never clears it */
  Compress sinkCompression = Compress.DISABLE;


  /** Field charsetName is the name of the charset used to decode sourced lines and encode sunk lines */
  String charsetName = DEFAULT_CHARSET;


  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   * <p/>
   * Sink compression is left at {@link Compress#DISABLE} and the charset at {@link #DEFAULT_CHARSET}.
   */
  public TextLine()
    {
    super( DEFAULT_SOURCE_FIELDS );
    }


  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   * <p/>
   * Sink compression is left at {@link Compress#DISABLE} and the charset at {@link #DEFAULT_CHARSET}.
   *
   * @param numSinkParts of type int, handed unchanged to the {@link Scheme} constructor
   */
  @ConstructorProperties({"numSinkParts"})
  public TextLine( int numSinkParts )
    {
    super( DEFAULT_SOURCE_FIELDS, numSinkParts );
    }


  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   *
   * @param sinkCompression of type Compress; {@code null} leaves compression at {@link Compress#DISABLE}
   */
  @ConstructorProperties({"sinkCompression"})
  public TextLine( Compress sinkCompression )
    {
    super( DEFAULT_SOURCE_FIELDS );


    setSinkCompression( sinkCompression );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @param sinkFields   the sink fields for this scheme
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields"})
  public TextLine( Fields sourceFields, Fields sinkFields )
    {
    super( sourceFields, sinkFields );


    verify( sourceFields );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   * <p/>
   * The charset name is checked before the source fields, so an unusable charset is reported first.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @param sinkFields   the sink fields for this scheme
   * @param charsetName  of type String; {@code null} keeps {@link #DEFAULT_CHARSET}
   * @throws IllegalArgumentException if the charset cannot be resolved, or if sourceFields does not have exactly
   *                                  one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "charsetName"})
  public TextLine( Fields sourceFields, Fields sinkFields, String charsetName )
    {
    super( sourceFields, sinkFields );


    // throws an exception if not found
    setCharsetName( charsetName );


    verify( sourceFields );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @param sinkFields   the sink fields for this scheme
   * @param numSinkParts of type int, handed unchanged to the {@link Scheme} constructor
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "numSinkParts"})
  public TextLine( Fields sourceFields, Fields sinkFields, int numSinkParts )
    {
    super( sourceFields, sinkFields, numSinkParts );


    verify( sourceFields );
    }


  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields    of type Fields, must declare one or two fields
   * @param sinkFields      of type Fields
   * @param sinkCompression of type Compress; {@code null} leaves compression at {@link Compress#DISABLE}
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression"})
  public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression )
    {
    super( sourceFields, sinkFields );


    setSinkCompression( sinkCompression );


    verify( sourceFields );
    }


  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   * <p/>
   * The charset name is checked before the source fields, so an unusable charset is reported first.
   *
   * @param sourceFields    of type Fields, must declare one or two fields
   * @param sinkFields      of type Fields
   * @param sinkCompression of type Compress; {@code null} leaves compression at {@link Compress#DISABLE}
   * @param charsetName     of type String; {@code null} keeps {@link #DEFAULT_CHARSET}
   * @throws IllegalArgumentException if the charset cannot be resolved, or if sourceFields does not have exactly
   *                                  one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression", "charsetName"})
  public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression, String charsetName )
    {
    super( sourceFields, sinkFields );


    setSinkCompression( sinkCompression );


    // throws an exception if not found
    setCharsetName( charsetName );


    verify( sourceFields );
    }


  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields    of type Fields, must declare one or two fields
   * @param sinkFields      of type Fields
   * @param sinkCompression of type Compress; {@code null} leaves compression at {@link Compress#DISABLE}
   * @param numSinkParts    of type int, handed unchanged to the {@link Scheme} constructor
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression", "numSinkParts"})
  public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts )
    {
    super( sourceFields, sinkFields, numSinkParts );


    setSinkCompression( sinkCompression );


    verify( sourceFields );
    }


  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   * <p/>
   * The charset name is checked before the source fields, so an unusable charset is reported first.
   *
   * @param sourceFields    of type Fields, must declare one or two fields
   * @param sinkFields      of type Fields
   * @param sinkCompression of type Compress; {@code null} leaves compression at {@link Compress#DISABLE}
   * @param numSinkParts    of type int, handed unchanged to the {@link Scheme} constructor
   * @param charsetName     of type String; {@code null} keeps {@link #DEFAULT_CHARSET}
   * @throws IllegalArgumentException if the charset cannot be resolved, or if sourceFields does not have exactly
   *                                  one or two fields
   */
  @ConstructorProperties({"sourceFields", "sinkFields", "sinkCompression", "numSinkParts", "charsetName"})
  public TextLine( Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts, String charsetName )
    {
    super( sourceFields, sinkFields, numSinkParts );


    setSinkCompression( sinkCompression );


    // throws an exception if not found
    setCharsetName( charsetName );


    verify( sourceFields );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields"})
  public TextLine( Fields sourceFields )
    {
    super( sourceFields );


    verify( sourceFields );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   * <p/>
   * The charset name is checked before the source fields, so an unusable charset is reported first.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @param charsetName  of type String; {@code null} keeps {@link #DEFAULT_CHARSET}
   * @throws IllegalArgumentException if the charset cannot be resolved, or if sourceFields does not have exactly
   *                                  one or two fields
   */
  @ConstructorProperties({"sourceFields", "charsetName"})
  public TextLine( Fields sourceFields, String charsetName )
    {
    super( sourceFields );


    // throws an exception if not found
    setCharsetName( charsetName );


    verify( sourceFields );
    }


  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples. The resulting data set will have numSinkParts.
   *
   * @param sourceFields the source fields for this scheme, must declare one or two fields
   * @param numSinkParts of type int, handed unchanged to the {@link Scheme} constructor
   * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
   */
  @ConstructorProperties({"sourceFields", "numSinkParts"})
  public TextLine( Fields sourceFields, int numSinkParts )
    {
    super( sourceFields, numSinkParts );


    verify( sourceFields );
    }


  protected void setCharsetName( String charsetName )
    {
    if( charsetName != null )
      this.charsetName = charsetName;


    Charset.forName( this.charsetName );
    }


  @Property(name = "charset", visibility = Visibility.PUBLIC)
  @PropertyDescription(value = "character set used in this scheme.")
  public String getCharsetName()
    {
    return charsetName;
    }


  protected void verify( Fields sourceFields )
    {
    if( sourceFields.size() < 1 || sourceFields.size() > 2 )
      throw new IllegalArgumentException( "this scheme requires either one or two source fields, given [" + sourceFields + "]" );
    }


  /**
   * Method getSinkCompression returns the sinkCompression of this TextLine object.
   *
   * @return the sinkCompression (type Compress) of this TextLine object.
   */
  @Property(name = "sinkCompression", visibility = Visibility.PUBLIC)
  @PropertyDescription(value = "The compression of the scheme when used in a sink.")
  public Compress getSinkCompression()
    {
    return sinkCompression;
    }


  /**
   * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, compression will remain disabled.
   *
   * @param sinkCompression the sinkCompression of this TextLine object.
   */
  public void setSinkCompression( Compress sinkCompression )
    {
    if( sinkCompression != null ) // leave disabled if null
      this.sinkCompression = sinkCompression;
    }


  @Override
  public void sourceConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    if( hasZippedFiles( FileInputFormat.getInputPaths( asJobConfInstance( conf ) ) ) )
      throw new IllegalStateException( "cannot read zip files: " + Arrays.toString( FileInputFormat.getInputPaths( asJobConfInstance( conf ) ) ) );


    conf.setBoolean( "mapred.mapper.new-api", false );
    conf.setClass( "mapred.input.format.class", TextInputFormat.class, InputFormat.class );
    }


  private boolean hasZippedFiles( Path[] paths )
    {
    if( paths == null || paths.length == 0 )
      return false;


    boolean isZipped = paths[ 0 ].getName().endsWith( ".zip" );


    for( int i = 1; i < paths.length; i++ )
      {
      if( isZipped != paths[ i ].getName().endsWith( ".zip" ) )
        throw new IllegalStateException( "cannot mix zipped and upzipped files" );
      }


    return isZipped;
    }


  /**
   * Method presentSourceFields is an intentionally empty override: the presented fields are ignored and no
   * state of this TextLine is changed.
   *
   * @param flowProcess the current flow process, not used here
   * @param tap         the tap presenting the fields, not used here
   * @param fields      the presented source fields, not used here
   */
  @Override
  public void presentSourceFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
    {
    // do nothing to change TextLine state
    }


  /**
   * Method presentSinkFields is an intentionally empty override: the presented fields are ignored and no
   * state of this TextLine is changed.
   *
   * @param flowProcess the current flow process, not used here
   * @param tap         the tap presenting the fields, not used here
   * @param fields      the presented sink fields, not used here
   */
  @Override
  public void presentSinkFields( FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields )
    {
    // do nothing to change TextLine state
    }


  @Override
  public void sinkConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    if( tap.getFullIdentifier( conf ).endsWith( ".zip" ) )
      throw new IllegalStateException( "cannot write zip files: " + HadoopUtil.getOutputPath( conf ) );


    conf.setBoolean( "mapred.mapper.new-api", false );


    if( getSinkCompression() == Compress.DISABLE )
      conf.setBoolean( "mapred.output.compress", false );
    else if( getSinkCompression() == Compress.ENABLE )
      conf.setBoolean( "mapred.output.compress", true );


    conf.setClass( "mapred.output.key.class", Text.class, Object.class );
    conf.setClass( "mapred.output.value.class", Text.class, Object.class );
    conf.setClass( "mapred.output.format.class", TextOutputFormat.class, OutputFormat.class );
    }


  @Override
  public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    if( sourceCall.getContext() == null )
      sourceCall.setContext( new Object[ 3 ] );


    sourceCall.getContext()[ 0 ] = sourceCall.getInput().createKey();
    sourceCall.getContext()[ 1 ] = sourceCall.getInput().createValue();
    sourceCall.getContext()[ 2 ] = Charset.forName( charsetName );
    }


  @Override
  public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    if( !sourceReadInput( sourceCall ) )
      return false;


    sourceHandleInput( sourceCall );


    return true;
    }


  private boolean sourceReadInput( SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    Object[] context = sourceCall.getContext();


    return sourceCall.getInput().next( context[ 0 ], context[ 1 ] );
    }


  protected void sourceHandleInput( SourceCall<Object[], RecordReader> sourceCall )
    {
    TupleEntry result = sourceCall.getIncomingEntry();


    int index = 0;
    Object[] context = sourceCall.getContext();


    // coerce into canonical forms
    if( getSourceFields().size() == 2 )
      result.setLong( index++, ( (LongWritable) context[ 0 ] ).get() );


    result.setString( index, makeEncodedString( context ) );
    }


  protected String makeEncodedString( Object[] context )
    {
    Text text = (Text) context[ 1 ];
    return new String( text.getBytes(), 0, text.getLength(), (Charset) context[ 2 ] );
    }


  /**
   * Method sourceCleanup clears the source call context, dropping the array set up by sourcePrepare.
   *
   * @param flowProcess the current flow process, not used here
   * @param sourceCall  the source call whose context is set to {@code null}
   */
  @Override
  public void sourceCleanup( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    sourceCall.setContext( null );
    }


  @Override
  public void sinkPrepare( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
    {
    sinkCall.setContext( new Object[ 2 ] );


    sinkCall.getContext()[ 0 ] = new Text();
    sinkCall.getContext()[ 1 ] = Charset.forName( charsetName );
    }


  @Override
  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall ) throws IOException
    {
    Text text = (Text) sinkCall.getContext()[ 0 ];
    Charset charset = (Charset) sinkCall.getContext()[ 1 ];
    String line = sinkCall.getOutgoingEntry().getTuple().toString();


    text.set( line.getBytes( charset ) );


    // it's ok to use NULL here so the collector does not write anything
    sinkCall.getOutput().collect( null, text );
    }
  }
Source Code of cascading.scheme.hadoop.TextLine

Related Classes of cascading.scheme.hadoop.TextLine