Source Code of cascading.tuple.hadoop.collect.HadoopSpillableTupleList

/*
 * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package cascading.tuple.hadoop.collect;


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;


import cascading.flow.FlowProcess;
import cascading.flow.FlowProcessWrapper;
import cascading.tuple.TupleException;
import cascading.tuple.collect.SpillableTupleList;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.tuple.hadoop.io.HadoopTupleInputStream;
import cascading.tuple.hadoop.io.HadoopTupleOutputStream;
import cascading.tuple.io.TupleInputStream;
import cascading.tuple.io.TupleOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * SpillableTupleList is a simple {@link Iterable} object that can store an unlimited number of {@link cascading.tuple.Tuple} instances by spilling
 * excess to a temporary disk file.
 * <p/>
 * Spills will automatically be compressed using the {@link #defaultCodecs} values. To disable compression or
 * change the codecs, see {@link cascading.tuple.collect.SpillableProps#SPILL_COMPRESS} and {@link cascading.tuple.collect.SpillableProps#SPILL_CODECS}.
 * <p/>
 * It is recommended to add Lzo if available.
 * {@code "org.apache.hadoop.io.compress.LzoCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec" }
 */
public class HadoopSpillableTupleList extends SpillableTupleList
  {
  private static final Logger LOG = LoggerFactory.getLogger( HadoopSpillableTupleList.class );


  public static final String defaultCodecs = "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec";


  /** Field codec */
  private final CompressionCodec codec;
  /** Field serializationElementWriter */
  private final TupleSerialization tupleSerialization;


  public static synchronized CompressionCodec getCodec( FlowProcess<? extends Configuration> flowProcess, String defaultCodecs )
    {
    Class<? extends CompressionCodec> codecClass = getCodecClass( flowProcess, defaultCodecs, CompressionCodec.class );


    if( codecClass == null )
      return null;


    if( flowProcess instanceof FlowProcessWrapper )
      flowProcess = ( (FlowProcessWrapper) flowProcess ).getDelegate();


    return ReflectionUtils.newInstance( codecClass, flowProcess.getConfigCopy() );
    }


  /**
   * Constructor SpillableTupleList creates a new SpillableTupleList instance using the given threshold value, and
   * the first available compression codec, if any.
   *
   * @param threshold of type long
   * @param codec     of type CompressionCodec
   */
  public HadoopSpillableTupleList( int threshold, CompressionCodec codec, Configuration configuration )
    {
    super( threshold );
    this.codec = codec;


    if( configuration == null )
      this.tupleSerialization = new TupleSerialization();
    else
      this.tupleSerialization = new TupleSerialization( configuration );
    }


  public HadoopSpillableTupleList( int threshold, TupleSerialization tupleSerialization, CompressionCodec codec )
    {
    super( threshold );
    this.tupleSerialization = tupleSerialization;
    this.codec = codec;
    }


  @Override
  protected TupleOutputStream createTupleOutputStream( File file )
    {
    OutputStream outputStream;


    try
      {
      outputStream = new FileOutputStream( file );


      Compressor compressor = null;


      if( codec != null )
        {
        compressor = getCompressor();
        outputStream = codec.createOutputStream( outputStream, compressor );
        }


      final Compressor finalCompressor = compressor;


      return new HadoopTupleOutputStream( outputStream, tupleSerialization.getElementWriter() )
      {
      @Override
      public void close() throws IOException
        {
        try
          {
          super.close();
          }
        finally
          {
          if( finalCompressor != null )
            CodecPool.returnCompressor( finalCompressor );
          }
        }
      };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file input stream", exception );
      }
    }


  private Compressor getCompressor()
    {
    // some codecs are using direct memory, and the gc for direct memory cannot sometimes keep up
    // so we attempt to force a gc if we see a OOME once.
    try
      {
      return CodecPool.getCompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating compressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );


      return CodecPool.getCompressor( codec );
      }
    }


  @Override
  protected TupleInputStream createTupleInputStream( File file )
    {
    try
      {
      InputStream inputStream;


      inputStream = new FileInputStream( file );


      Decompressor decompressor = null;


      if( codec != null )
        {
        decompressor = getDecompressor();
        inputStream = codec.createInputStream( inputStream, decompressor );
        }


      final Decompressor finalDecompressor = decompressor;
      return new HadoopTupleInputStream( inputStream, tupleSerialization.getElementReader() )
      {
      @Override
      public void close() throws IOException
        {
        try
          {
          super.close();
          }
        finally
          {
          if( finalDecompressor != null )
            CodecPool.returnDecompressor( finalDecompressor );
          }
        }
      };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file output stream", exception );
      }
    }


  private Decompressor getDecompressor()
    {
    // some codecs are using direct memory, and the gc for direct memory cannot sometimes keep up
    // so we attempt to force a gc if we see a OOME once.
    try
      {
      return CodecPool.getDecompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating decompressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );


      return CodecPool.getDecompressor( codec );
      }
    }
  }
Source Code of cascading.tuple.hadoop.collect.HadoopSpillableTupleList

Related Classes of cascading.tuple.hadoop.collect.HadoopSpillableTupleList