
Source Code of cascading.tuple.hadoop.collect.HadoopSpillableTupleList

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.tuple.hadoop.collect;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import cascading.flow.FlowProcess;
import cascading.flow.FlowProcessWrapper;
import cascading.tuple.TupleException;
import cascading.tuple.collect.SpillableTupleList;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.tuple.hadoop.io.HadoopTupleInputStream;
import cascading.tuple.hadoop.io.HadoopTupleOutputStream;
import cascading.tuple.io.TupleInputStream;
import cascading.tuple.io.TupleOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* HadoopSpillableTupleList is a simple {@link Iterable} object that can store an unlimited number of {@link cascading.tuple.Tuple} instances by spilling
* excess to a temporary disk file.
* <p/>
* Spills will automatically be compressed using the {@link #defaultCodecs} values. To disable compression or
* change the codecs, see {@link cascading.tuple.collect.SpillableProps#SPILL_COMPRESS} and {@link cascading.tuple.collect.SpillableProps#SPILL_CODECS};
* a configuration sketch follows this listing.
* <p/>
* It is recommended to add Lzo, if available:
* {@code "org.apache.hadoop.io.compress.LzoCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec" }
*/
public class HadoopSpillableTupleList extends SpillableTupleList
  {
  private static final Logger LOG = LoggerFactory.getLogger( HadoopSpillableTupleList.class );

  public static final String defaultCodecs = "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec";

  /** Field codec */
  private final CompressionCodec codec;
  /** Field serializationElementWriter */
  private final TupleSerialization tupleSerialization;

  public static synchronized CompressionCodec getCodec( FlowProcess<? extends Configuration> flowProcess, String defaultCodecs )
    {
    Class<? extends CompressionCodec> codecClass = getCodecClass( flowProcess, defaultCodecs, CompressionCodec.class );

    if( codecClass == null )
      return null;

    // a FlowProcessWrapper hides the delegate FlowProcess; unwrap it so the
    // codec is instantiated against the underlying Hadoop configuration
    if( flowProcess instanceof FlowProcessWrapper )
      flowProcess = ( (FlowProcessWrapper) flowProcess ).getDelegate();

    return ReflectionUtils.newInstance( codecClass, flowProcess.getConfigCopy() );
    }

  /**
   * Constructor HadoopSpillableTupleList creates a new HadoopSpillableTupleList instance using the given threshold value, and
   * the first available compression codec, if any.
   *
   * @param threshold     of type int
   * @param codec         of type CompressionCodec
   * @param configuration of type Configuration
   */
  public HadoopSpillableTupleList( int threshold, CompressionCodec codec, Configuration configuration )
    {
    super( threshold );
    this.codec = codec;

    if( configuration == null )
      this.tupleSerialization = new TupleSerialization();
    else
      this.tupleSerialization = new TupleSerialization( configuration );
    }

  /**
   * Constructor HadoopSpillableTupleList creates a new HadoopSpillableTupleList instance using the given threshold value,
   * the given serialization, and the given compression codec, if any.
   *
   * @param threshold          of type int
   * @param tupleSerialization of type TupleSerialization
   * @param codec              of type CompressionCodec
   */
  public HadoopSpillableTupleList( int threshold, TupleSerialization tupleSerialization, CompressionCodec codec )
    {
    super( threshold );
    this.tupleSerialization = tupleSerialization;
    this.codec = codec;
    }

  @Override
  protected TupleOutputStream createTupleOutputStream( File file )
    {
    OutputStream outputStream;

    try
      {
      outputStream = new FileOutputStream( file );

      Compressor compressor = null;

      if( codec != null )
        {
        compressor = getCompressor();
        outputStream = codec.createOutputStream( outputStream, compressor );
        }

      final Compressor finalCompressor = compressor;

      return new HadoopTupleOutputStream( outputStream, tupleSerialization.getElementWriter() )
      {
      @Override
      public void close() throws IOException
        {
        try
          {
          super.close();
          }
        finally
          {
          if( finalCompressor != null )
            CodecPool.returnCompressor( finalCompressor );
          }
        }
      };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file input stream", exception );
      }
    }

  private Compressor getCompressor()
    {
    // some codecs use direct memory, and the GC for direct memory cannot always keep up,
    // so we force a GC and retry once if we see an OOME
    try
      {
      return CodecPool.getCompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating compressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );

      return CodecPool.getCompressor( codec );
      }
    }

  @Override
  protected TupleInputStream createTupleInputStream( File file )
    {
    try
      {
      InputStream inputStream;

      inputStream = new FileInputStream( file );

      Decompressor decompressor = null;

      if( codec != null )
        {
        decompressor = getDecompressor();
        inputStream = codec.createInputStream( inputStream, decompressor );
        }

      final Decompressor finalDecompressor = decompressor;
      return new HadoopTupleInputStream( inputStream, tupleSerialization.getElementReader() )
      {
      @Override
      public void close() throws IOException
        {
        try
          {
          super.close();
          }
        finally
          {
          if( finalDecompressor != null )
            CodecPool.returnDecompressor( finalDecompressor );
          }
        }
      };
      }
    catch( IOException exception )
      {
      throw new TupleException( "unable to create temporary file input stream", exception );
      }
    }

  private Decompressor getDecompressor()
    {
    // some codecs use direct memory, and the GC for direct memory cannot always keep up,
    // so we force a GC and retry once if we see an OOME
    try
      {
      return CodecPool.getDecompressor( codec );
      }
    catch( OutOfMemoryError error )
      {
      System.gc();
      LOG.info( "received OOME when allocating decompressor for codec: {}, retrying once", codec.getClass().getCanonicalName(), error );

      return CodecPool.getDecompressor( codec );
      }
    }
  }
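
Configuring spill compression

A minimal sketch of tuning the spill behavior described in the class Javadoc above. The property keys come from cascading.tuple.collect.SpillableProps as referenced there; the exact key values and the Properties-based setup are assumptions for illustration, not taken from this listing.

import java.util.Properties;

import cascading.tuple.collect.SpillableProps;

public class SpillCompressionConfig
  {
  /** Returns properties enabling compressed spills with an Lzo-first codec list. */
  public static Properties configure()
    {
    Properties properties = new Properties();

    // keep compression of spilled tuple files enabled (the default)
    properties.setProperty( SpillableProps.SPILL_COMPRESS, "true" );

    // prefer Lzo when available, as the Javadoc recommends, then fall back
    // to Gzip and the Hadoop default codec
    properties.setProperty( SpillableProps.SPILL_CODECS,
      "org.apache.hadoop.io.compress.LzoCodec,"
        + "org.apache.hadoop.io.compress.GzipCodec,"
        + "org.apache.hadoop.io.compress.DefaultCodec" );

    return properties;
    }
  }

These properties would typically be passed to a FlowConnector when the flow is assembled, so every spillable collection in the flow picks them up.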
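Using the list directly

A hedged sketch of exercising the class outside a running Flow. The static getCodec helper, the constructor signature, and the defaultCodecs constant are taken from the listing above; the HadoopFlowProcess and JobConf setup is an assumption for illustration, as is the Collection-style add and iteration inherited from SpillableTupleList.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.JobConf;

import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.tuple.Tuple;
import cascading.tuple.hadoop.collect.HadoopSpillableTupleList;

public class SpillableListExample
  {
  public static void main( String[] args )
    {
    Configuration configuration = new Configuration();
    HadoopFlowProcess flowProcess = new HadoopFlowProcess( new JobConf( configuration ) );

    // resolve the first loadable codec from the default list; may be null,
    // in which case spills are written uncompressed
    CompressionCodec codec = HadoopSpillableTupleList.getCodec( flowProcess, HadoopSpillableTupleList.defaultCodecs );

    // hold at most 10,000 tuples in memory before spilling to a temporary file
    HadoopSpillableTupleList list = new HadoopSpillableTupleList( 10000, codec, configuration );

    for( int i = 0; i < 100000; i++ )
      list.add( new Tuple( i, "value-" + i ) );

    long count = 0;

    // iterating re-reads any spilled tuples from disk, decompressing them transparently
    for( Tuple tuple : list )
      count++;

    System.out.println( "iterated " + count + " tuples" );
    }
  }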
