Package com.twitter.elephantbird.util

Source Code of com.twitter.elephantbird.util.LzoUtils

package com.twitter.elephantbird.util;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;

/**
* Miscellaneous lzo related utilities.
*/
public class LzoUtils {

  public static final Logger LOG = LoggerFactory.getLogger(LzoUtils.class);

  /**
   * A work-around to support environments with older versions of LzopCodec.
   * It might not be feasible for to select right version of hadoop-lzo
   * in some cases. This should be removed latest by EB-3.0.
   */
  private static boolean isLzopIndexSupported = false;
  static {
    try {
      isLzopIndexSupported =
        null != LzopCodec.class.getMethod("createIndexedOutputStream",
                                          OutputStream.class,
                                          DataOutputStream.class);
    } catch (Exception e) {
      // older version of hadoop-lzo.
    }
  }

  /**
   * Creates an lzop output stream. The index for the lzop is
   * also written to another at the same time if
   * <code>elephantbird.lzo.output.index</code> is set in configuration. <p>
   *
   * If the file size at closing is not larger than a single block,
   * the index file is deleted (in line with {@link LzoIndexer} behavior).
   */
  public static DataOutputStream
  getIndexedLzoOutputStream(Configuration conf, Path path) throws IOException {

    LzopCodec codec = new LzopCodec();
    codec.setConf(conf);

    final Path file = path;
    final FileSystem fs = file.getFileSystem(conf);
    FSDataOutputStream fileOut = fs.create(file, false);

    FSDataOutputStream indexOut = null;
    if (conf.getBoolean("elephantbird.lzo.output.index", false)) {
      if ( isLzopIndexSupported ) {
        Path indexPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);
        indexOut = fs.create(indexPath, false);
      } else {
        LOG.warn("elephantbird.lzo.output.index is enabled, but LzopCodec "
            + "does not have createIndexedOutputStream method. "
            + "Please upgrade hadoop-lzo.");
      }
    }

    final boolean isIndexed = indexOut != null;

    OutputStream out = ( isIndexed ?
        codec.createIndexedOutputStream(fileOut, indexOut) :
        codec.createOutputStream(fileOut) );

    return new DataOutputStream(out) {
      // override close() to handle renaming index file.

      public void close() throws IOException {
        super.close();

        if ( isIndexed ) {
          // rename or remove the index file based on file size.

          Path tmpPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);
          FileStatus stat = fs.getFileStatus(file);
          if (stat.getLen() <= stat.getBlockSize()) {
            fs.delete(tmpPath, false);
          } else {
            fs.rename(tmpPath, file.suffix(LzoIndex.LZO_INDEX_SUFFIX));
          }
        }
      }
    };
  }

}
TOP

Related Classes of com.twitter.elephantbird.util.LzoUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.