Package cascading.scheme.hadoop

Source Code of cascading.scheme.hadoop.SequenceFile

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.scheme.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

/**
* A SequenceFile is a type of {@link cascading.scheme.Scheme}, which is a flat file consisting of
* binary key/value pairs. This is a space and time efficient means to store data.
*/
public class SequenceFile extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Void>
  {
  /** Protected for use by TempDfs and other subclasses. Not for general consumption. */
  protected SequenceFile()
    {
    super( null );
    }

  /**
   * Creates a new SequenceFile instance that stores the given field names.
   *
   * @param fields
   */
  @ConstructorProperties({"fields"})
  public SequenceFile( Fields fields )
    {
    super( fields, fields );
    }

  @Override
  public void sourceConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    conf.setBoolean( "mapred.mapper.new-api", false );
    conf.setClass( "mapred.input.format.class", SequenceFileInputFormat.class, InputFormat.class );
    }

  @Override
  public void sinkConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    conf.setBoolean( "mapred.mapper.new-api", false );
    conf.setClass( "mapred.output.key.class", Tuple.class, Object.class );
    conf.setClass( "mapred.output.value.class", Tuple.class, Object.class );
    conf.setClass( "mapred.output.format.class", SequenceFileOutputFormat.class, OutputFormat.class );
    }

  @Override
  public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    Object[] pair = new Object[]{
      sourceCall.getInput().createKey(),
      sourceCall.getInput().createValue()
    };

    sourceCall.setContext( pair );
    }

  @Override
  public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    Tuple key = (Tuple) sourceCall.getContext()[ 0 ];
    Tuple value = (Tuple) sourceCall.getContext()[ 1 ];
    boolean result = sourceCall.getInput().next( key, value );

    if( !result )
      return false;

    TupleEntry entry = sourceCall.getIncomingEntry();

    if( entry.hasTypes() )
      entry.setCanonicalTuple( value );
    else
      entry.setTuple( value );

    return true;
    }

  @Override
  public void sourceCleanup( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    sourceCall.setContext( null );
    }

  @Override
  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Void, OutputCollector> sinkCall ) throws IOException
    {
    sinkCall.getOutput().collect( Tuple.NULL, sinkCall.getOutgoingEntry().getTuple() );
    }
  }
TOP

Related Classes of cascading.scheme.hadoop.SequenceFile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.