// Source: org.commoncrawl.hadoop.mergeutils.SequenceFileMerger
// (web-scrape header text removed - it preceded the package declaration and
// made the file uncompilable)

package org.commoncrawl.hadoop.mergeutils;

/*
*    Copyright 2010 - CommonCrawl Foundation
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
*/

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey;
import org.commoncrawl.util.shared.CCStringUtils;
import org.commoncrawl.util.shared.IntrusiveList;
import org.commoncrawl.util.shared.IntrusiveList.IntrusiveListElement;

/**
* merge sort a pre-sorted set of sequence files and spill them to output
*
*
*
* @author rana
*
* @param <KeyType>
* @param <ValueType>
*/
public class SequenceFileMerger<KeyType extends WritableComparable, ValueType extends Writable> {

  /** hadoop counters reported during a merge (see updateProgress) */
  public enum Counters {
    RECORDS_MERGED,
    PCT_COMPLETED
  }
 
  public static final Log LOG = LogFactory.getLog(SequenceFileMerger.class);

  // the set of input files (segments) to operate on
  IntrusiveList<MergeResultSegment<KeyType, ValueType>> _segmentList = new IntrusiveList<MergeResultSegment<KeyType, ValueType>>();
  // the initial segement count
  int _originalSegementCount = 0;
  // completed segment count
  int _completedSegmentCount = 0;
  // last known percent complete value
  long _percentComplete = 0L;
  // the output spill writer
  SpillWriter<KeyType, ValueType> _writer = null;
  // a reference to the raw writer interface if _writer implements
  // RawDataSpillWriter
  RawDataSpillWriter<KeyType, ValueType> _rawWriter = null;
  // basic key value comparator used to merge files
  KeyValuePairComparator<KeyType, ValueType> _comparator;
  // raw comparator if supported
  RawKeyValueComparator<KeyType, ValueType> _rawComparator = null;
  // optimized key generator interface
  OptimizedKeyGeneratorAndComparator<KeyType, ValueType> _optimizedKeyGenerator = null;
  // optional combiner interface
  SpillValueCombiner<KeyType, ValueType> _optionalCombiner = null;
  // input record counter
  long _inputRecordCount = 0;
  // merged record count
  long _mergedRecordCount = 0;
  // optimized key type
  int _optimizedKeyType = 0;

  /**
   * construct a basic merger using a standard basic or raw comparator
   *
   * @param fileSystem       filesystem hosting the input segments
   * @param conf             hadoop configuration used to open the readers
   * @param inputSegments    paths of the pre-sorted input sequence files
   * @param spillWriter      destination the merged records are spilled to
   * @param keyClass         key type stored in the input files
   * @param valueClass       value type stored in the input files
   * @param optionalCombiner optional combiner used to collapse consecutive
   *                         records sharing the same key (may be null)
   * @param comparator       key/value comparator driving the merge order; if
   *                         it also implements RawKeyValueComparator the
   *                         merge runs in raw (serialized bytes) mode
   * @throws IOException if any input segment cannot be opened
   */
  public SequenceFileMerger(FileSystem fileSystem, Configuration conf,
      Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
      Class<KeyType> keyClass, Class<ValueType> valueClass,
      SpillValueCombiner<KeyType, ValueType> optionalCombiner,
      KeyValuePairComparator<KeyType, ValueType> comparator) throws IOException {
    // common init ...
    init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
        comparator, null, optionalCombiner);
  }

  /**
   * construct a specialized merger that uses an optimized key generator to
   * speed merges (used by merge sort spill writer)
   *
   * this constructor is package private since it requires a special contract
   * between mergesortspillwriter and sequencefilemerger
   *
   * @param fileSystem    filesystem hosting the input segments
   * @param conf          hadoop configuration used to open the readers
   * @param inputSegments paths of the pre-sorted input sequence files
   * @param spillWriter   destination the merged records are spilled to
   * @param keyClass      key type stored in the input files
   * @param valueClass    value type stored in the input files
   * @param keyGenerator  generator whose optimized keys (long and/or buffer)
   *                      accelerate record comparisons during the merge
   * @throws IOException if any input segment cannot be opened
   */
  SequenceFileMerger(FileSystem fileSystem, Configuration conf,
      Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
      Class<KeyType> keyClass, Class<ValueType> valueClass,
      OptimizedKeyGeneratorAndComparator<KeyType, ValueType> keyGenerator)
      throws IOException {

    // cache the generated key type bits (long and/or buffer) used by the
    // comparison code paths
    _optimizedKeyType = keyGenerator.getGeneratedKeyType();
    // common init ...
    init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
        null, keyGenerator, null);
  }

  /**
   * construct a merger that uses a raw (serialized byte-level) comparator
   *
   * NOTE(review): the previous comment claimed this constructor is package
   * private, but it is declared public.
   *
   * @param fileSystem    filesystem hosting the input segments
   * @param conf          hadoop configuration used to open the readers
   * @param inputSegments paths of the pre-sorted input sequence files
   * @param spillWriter   destination the merged records are spilled to
   * @param keyClass      key type stored in the input files
   * @param valueClass    value type stored in the input files
   * @param comparator    raw comparator operating on serialized key/value
   *                      bytes
   * @throws IOException if any input segment cannot be opened
   */
  public SequenceFileMerger(FileSystem fileSystem, Configuration conf,
      Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
      Class<KeyType> keyClass, Class<ValueType> valueClass,
      RawKeyValueComparator<KeyType, ValueType> comparator) throws IOException {

    // common init ...
    init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
        comparator, null, null);
  }

  /**
   * close and flush the merger
   *
   * @throws IOException
   */
  public void close() throws IOException {
    for (MergeResultSegment<KeyType, ValueType> segment : _segmentList) {
      try {
        segment.close();
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }
  }

  /**
   * merge the pre-sorted inputs and spill them to the output writer.
   *
   * Algorithm: the segment list is kept sorted by each segment's current key.
   * The head segment (smallest key) is repeatedly removed, its current record
   * is spilled (or buffered for the optional combiner), the segment is
   * advanced to its next record and re-inserted at the proper sorted position
   * via binary search; exhausted segments are closed and counted as complete.
   *
   * @param reporter optional hadoop progress reporter (may be null)
   * @throws IOException propagated from the writer / combiner on failure
   */
  @SuppressWarnings("unchecked")
  public void mergeAndSpill(final Reporter reporter) throws IOException {
    long sortStartTime = System.currentTimeMillis();

    // scratch array used to binary-search the segment list when re-inserting
    // an advanced segment (sized to hold every live segment)
    MergeResultSegment<KeyType, ValueType> sortArray[] = new MergeResultSegment[_segmentList
                                                                                .size() + 1];

    // combiner state: the key currently being accumulated and its values
    KeyType lastCombinerKey = null;
    Vector<ValueType> valueBuffer = new Vector<ValueType>();

    while (_segmentList.getHead() != null) {
      MergeResultSegment<KeyType, ValueType> spillSegment = null;
      try {
        // get the head element (segment holding the smallest current key)
        spillSegment = _segmentList.removeHead();
        // and spill its current key/value pair ...
        // if no combiner spill directly ...
        if (_optionalCombiner == null) {
          _mergedRecordCount++;
          // ok in the case of optimized keys ...
          if (_optimizedKeyGenerator != null) {
            // spill only the raw key, skipping the optimized key part ...

            // ok segments with optimized keys have {optimized key header} +
            // {original-key-len} preceeding the actual key bytes
            // and optional buffer data at tail end of value

            _rawWriter.spillRawRecord(spillSegment.getRawKeyData().getData(),
                spillSegment.getOptimizedKey().getHeaderSize() + 4,
                spillSegment.getRawKeyData().getLength()
                - spillSegment.getOptimizedKey().getHeaderSize() - 4,
                spillSegment.getRawValueData().getData(), 0, spillSegment
                .getRawValueData().getLength()
                - spillSegment.getOptimizedKey().getDataBufferSize());

          } else if (_rawComparator != null) {
            // raw mode without optimized keys - spill key/value bytes as-is
            _rawWriter.spillRawRecord(spillSegment.getRawKeyData().getData(),
                0, spillSegment.getRawKeyData().getLength(), spillSegment
                .getRawValueData().getData(), 0, spillSegment
                .getRawValueData().getLength());
          } else {
            // typed (non-raw) spill path
            _writer.spillRecord(spillSegment.getKey(), spillSegment.getValue());
          }
        } else {
          // combiner path: when the key changes, combine and flush the values
          // accumulated for the previous key
          if (valueBuffer.size() != 0
              && lastCombinerKey.compareTo(spillSegment.getKey()) != 0) {

            // combine and flush last set of values ...
            _mergedRecordCount++;
            _writer.spillRecord(lastCombinerKey, _optionalCombiner
                .combineValues(lastCombinerKey, valueBuffer));
            // clear accumulation buffer
            valueBuffer.clear();
          }
          if (valueBuffer.size() == 0) {
            // set current key as lastKey
            lastCombinerKey = spillSegment.getKey();
          }
          // add value to buffer
          valueBuffer.add(spillSegment.getValue());
        }
        // and see if there is a next item for the spilled segment
        if (spillSegment.next()) {
          _inputRecordCount++;
          // yes, ok insert it back into the list at the appropriate position
          // ...
          if (_segmentList.size() == 0) {
            _segmentList.addHead(spillSegment);
          } else {
            // first convert existing list to array
            addItemsToArray(sortArray, _segmentList);
            // next find insertion position
            MergeResultSegment<KeyType, ValueType> insertionPos = _findInsertionPos(
                spillSegment, sortArray, _segmentList.size());
            // if null, add to head ...
            if (insertionPos == null) {
              _segmentList.addHead(spillSegment);
            } else {
              _segmentList.insertAfter(insertionPos, spillSegment);
            }
          }
        }
        // otherwise ...
        else {
          // segment exhausted - close it and count it as completed
          try {
            spillSegment.close();
          } catch (IOException e) {
            LOG.error("Segment:" + spillSegment.getName() + " Exception:"
                + CCStringUtils.stringifyException(e));
          }
          finally {
            _completedSegmentCount++;
          }
        }
      } catch (IOException e) {
        // NOTE(review): at this point the segment has already been removed
        // from the list and is neither re-queued nor closed - its remaining
        // records are silently dropped and its reader is never closed.
        // Confirm this best-effort behavior is intended.
        LOG.error(CCStringUtils.stringifyException(e));
        if (spillSegment != null) {
          LOG.error("Error during splill of segment:" + spillSegment.getName()
              + " Exception:" + CCStringUtils.stringifyException(e));
        }
      }

      // periodic progress reporting
      if (_mergedRecordCount % 100000 == 0) {
        updateProgress(reporter);
        LOG.info("Merged " + _mergedRecordCount + " Items");
      }
    }
   
    updateProgress(reporter);
   
    // now, if combiner is not null and there is a value buffered up ..
    if (_optionalCombiner != null && valueBuffer.size() != 0) {
      _mergedRecordCount++;
      // combine and flush last set of values ...
      _writer.spillRecord(lastCombinerKey, _optionalCombiner.combineValues(
          lastCombinerKey, valueBuffer));
      // clear combiner buffer ..
      valueBuffer.clear();
    }
    LOG.info("Merge took:" + (System.currentTimeMillis() - sortStartTime)
        + " InputRecordCount:" + _inputRecordCount + " MergedRecordCount:"
        + _mergedRecordCount);
  }
 
  void updateProgress(Reporter reporter) {
    if (reporter != null) {
      // ok fraction attributed to each segment
      float segmentFraction = 1.0f / _originalSegementCount;
      // add in completed segments
      float pctComplete = segmentFraction * _completedSegmentCount;
      // add in partial segments ...
      for (MergeResultSegment segment : _segmentList) {
        // calculated partial completion score
        pctComplete += segmentFraction * segment.getPercentComplete();
      }
      reporter.incrCounter(Counters.RECORDS_MERGED,_mergedRecordCount);
      long pctCompleteAsLong = (long)(pctComplete * 100L);
      long delta = pctCompleteAsLong - _percentComplete;
      if (delta > 0) {
        reporter.incrCounter(Counters.PCT_COMPLETED,delta);
        _percentComplete = pctCompleteAsLong;
      }
    }
  }

  /**
   * internal init method
   *
   * @param fileSystem
   * @param conf
   * @param inputSegments
   * @param spillWriter
   * @param keyClass
   * @param valueClass
   * @param comparator
   * @param optionalKeyGenerator
   * @param optionalCombiner
   * @throws IOException
   */
  private void init(
      FileSystem fileSystem,
      Configuration conf,
      Vector<Path> inputSegments,
      SpillWriter<KeyType, ValueType> spillWriter,
      Class<KeyType> keyClass,
      Class<ValueType> valueClass,
      KeyValuePairComparator<KeyType, ValueType> comparator,
      OptimizedKeyGeneratorAndComparator<KeyType, ValueType> optionalKeyGenerator,
      SpillValueCombiner<KeyType, ValueType> optionalCombiner

  ) throws IOException {

    _comparator = comparator;
    _optimizedKeyGenerator = optionalKeyGenerator;
    if (_comparator instanceof RawKeyValueComparator) {
      _rawComparator = (RawKeyValueComparator<KeyType, ValueType>) _comparator;
    }
    if (_rawComparator != null && _optimizedKeyGenerator != null) {
      throw new IOException(
      "RawComparator not compatible with OptimizedKeyGenerator option!");
    }
    _optionalCombiner = optionalCombiner;

    try {

      Vector<MergeResultSegment<KeyType, ValueType>> segments = new Vector<MergeResultSegment<KeyType, ValueType>>();

      for (Path path : inputSegments) {
        // LOG.info("Loading QueryResultSegment:" + path);
        MergeResultSegment<KeyType, ValueType> resultSegment = new MergeResultSegment<KeyType, ValueType>(
            fileSystem, conf, path, keyClass, valueClass,
            _rawComparator != null || _optimizedKeyGenerator != null,
            _optimizedKeyGenerator);
        if (!resultSegment.next()) {
          // LOG.info("QueryResultSegment:" + path
          // +" returned EOS on initial next.Ignoring Segment");
          try {
            resultSegment.close();
          } catch (IOException e) {
            LOG.error("QueryResultSegment:" + path + " Threw Exception:"
                + CCStringUtils.stringifyException(e));
          }
        } else {
          _inputRecordCount++;
          segments.add(resultSegment);
        }
      }

      // create temporary array for sorting purposes ...
      MergeResultSegment<KeyType, ValueType> segmentArray[] = segments
      .toArray(new MergeResultSegment[0]);
      // sort the array ...
      Arrays.sort(segmentArray,
          new Comparator<MergeResultSegment<KeyType, ValueType>>() {

        @Override
        public int compare(MergeResultSegment<KeyType, ValueType> o1,
            MergeResultSegment<KeyType, ValueType> o2) {
          try {
            if (_optimizedKeyGenerator != null) {
              int result = 0;
              if ((_optimizedKeyType & OptimizedKey.KEY_TYPE_LONG) != 0) {
                result = (int) (o1.getOptimizedKey().getLongKeyValue() - o2
                    .getOptimizedKey().getLongKeyValue());
              }
              if (result == 0
                  && ((_optimizedKeyType & OptimizedKey.KEY_TYPE_BUFFER) != 0)) {
                // compare buffers ...
                result = _optimizedKeyGenerator.compareOptimizedBufferKeys(
                    o1.getOptimizedKey().getBufferKeyValue().get(), o1
                    .getOptimizedKey().getBufferKeyValue().getOffset(),
                    o1.getOptimizedKey().getBufferKeyValue().getCount(), o2
                    .getOptimizedKey().getBufferKeyValue().get(), o2
                    .getOptimizedKey().getBufferKeyValue().getOffset(),
                    o2.getOptimizedKey().getBufferKeyValue().getCount());

              }
              return result;
            } else if (_rawComparator != null) {
              return _rawComparator.compareRaw(
                  o1.getRawKeyData().getData(), 0, o1.getRawKeyData()
                  .getLength(), o2.getRawKeyData().getData(), 0, o2
                  .getRawKeyData().getLength(), o1.getRawValueData()
                  .getData(), 0, o1.getRawValueData().getLength(), o2
                  .getRawValueData().getData(), 0, o2.getRawValueData()
                  .getLength());
            } else {
              return _comparator.compare(o1.getKey(), o1.getValue(), o2
                  .getKey(), o2.getValue());
            }
          } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new RuntimeException(e);
          }
        }
      });
      // LOG.info("Initial sorted segment list is ....");
      // now store the segments in sorted order ...
      int index = 0;
      for (MergeResultSegment<KeyType, ValueType> segment : segmentArray) {
        segment.setIndex(index++);
        _segmentList.addTail(segment);
      }
      _originalSegementCount = segmentArray.length;

      _writer = spillWriter;
      if (!(_writer instanceof RawDataSpillWriter)) {
        throw new IOException(
        "Writer supplied with RawComparator does not implement RawDataSpillWriter");
      }
      _rawWriter = (RawDataSpillWriter<KeyType, ValueType>) _writer;

    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));

      for (MergeResultSegment<KeyType, ValueType> segment : _segmentList) {
        try {
          segment.close();
        } catch (IOException e2) {
          LOG.error(CCStringUtils.stringifyException(e2));
        }
      }
      throw e;
    }
  }

  /**
   * add merge segments to sort array
   *
   * @param array
   * @param list
   */
  private final void addItemsToArray(
      MergeResultSegment<KeyType, ValueType>[] array,
      IntrusiveList<MergeResultSegment<KeyType, ValueType>> list) {
    MergeResultSegment<KeyType, ValueType> current = list.getHead();
    int pos = 0;
    while (current != null) {
      array[pos++] = current;
      current = current.getNext();
    }
  }

  // do a binary search in the map to find the right value
  private final MergeResultSegment<KeyType, ValueType> _findInsertionPos(
      MergeResultSegment<KeyType, ValueType> searchSegment,
      MergeResultSegment<KeyType, ValueType>[] array, int arrayCount)
      throws IOException {

    int low = 0;
    int high = arrayCount - 1;

    while (low <= high) {
      int mid = low + ((high - low) / 2);

      MergeResultSegment<KeyType, ValueType> segment = array[mid];

      int compareResult = 0;
      if (_optimizedKeyGenerator != null) {
        if ((_optimizedKeyType & OptimizedKey.KEY_TYPE_LONG) != 0) {
          compareResult = (int) (segment.getOptimizedKey().getLongKeyValue() - searchSegment
              .getOptimizedKey().getLongKeyValue());
        }
        if (compareResult == 0
            && (_optimizedKeyType & OptimizedKey.KEY_TYPE_BUFFER) != 0) {
          // compare buffers ...
          compareResult = _optimizedKeyGenerator.compareOptimizedBufferKeys(
              segment.getOptimizedKey().getBufferKeyValue().get(), segment
              .getOptimizedKey().getBufferKeyValue().getOffset(), segment
              .getOptimizedKey().getBufferKeyValue().getCount(),
              searchSegment.getOptimizedKey().getBufferKeyValue().get(),
              searchSegment.getOptimizedKey().getBufferKeyValue().getOffset(),
              searchSegment.getOptimizedKey().getBufferKeyValue().getCount());
        }
      } else if (_rawComparator != null) {
        compareResult = _rawComparator.compareRaw(segment.getRawKeyData()
            .getData(), 0, segment.getRawKeyData().getLength(), searchSegment
            .getRawKeyData().getData(), 0, searchSegment.getRawKeyData()
            .getLength(), segment.getRawValueData().getData(), 0, segment
            .getRawValueData().getLength(), searchSegment.getRawValueData()
            .getData(), 0, searchSegment.getRawValueData().getLength());

      } else {
        compareResult = _comparator.compare(segment.getKey(), segment
            .getValue(), searchSegment.getKey(), searchSegment.getValue());
      }

      // LOG.info("Compare Between" + segment.getKey().toString() + " and " +
      // searchSegment.getKey() + " returned:" + compareResult);
      if (compareResult > 0) {
        // LOG.info("Setting high to:" + (mid - 1));
        high = mid - 1;
      } else if (compareResult < 0) {
        // LOG.info("Setting low to:" + (mid + 1));
        low = mid + 1;
      } else {
        // LOG.info("Found match. returning item at:" + mid);
        return array[mid]; // found
      }

    }
    // not found ... return best insertion position ...
    if (high == -1) {
      // LOG.info("High == -1. Returning NULL");
      return null;
    } else {
      // LOG.info("Returning element at index:" + high);
      return array[high];
    }
  }

  /**
   * wraps a single pre-sorted input sequence file: holds the reader, the
   * current (deserialized or raw) key/value pair, and the segment's read
   * progress. Instances are chained into the merger's intrusive segment list.
   */
  private static class MergeResultSegment<KeyType extends Writable, ValueType extends Writable>
  extends IntrusiveListElement<MergeResultSegment<KeyType, ValueType>> {

    // signature used to look up the no-arg key/value constructors
    private static final Class[] emptyArray = new Class[] {};

    // underlying sequence file reader (null for the empty "null" segment)
    SequenceFile.Reader reader = null;
    // current deserialized key/value (typed, non-raw mode only)
    KeyType key = null;
    ValueType value = null;
    // cached no-arg constructors used to materialize a fresh key/value
    // instance per record in typed mode
    Constructor<KeyType> keyConstructor = null;
    Constructor<ValueType> valConstructor = null;
    // true once the reader has been exhausted
    boolean eos = false;
    Path path;
    // last observed reader position - used to refresh percentComplete
    long lastPos;
    long fileSize;
    // position assigned by the merger after the initial sort
    int index = -1;
    // raw mode: keep serialized key/value bytes instead of deserializing
    boolean useRawMode = false;
    DataOutputBuffer rawKeyData = null;
    DataOutputBuffer rawValueData = null;
    ValueBytes valueBytes = null;
    OptimizedKeyGeneratorAndComparator<KeyType, ValueType> _optimizedGenerator = null;
    OptimizedKey _optimizedKey = null;
    // fraction of the file consumed so far (0.0 - 1.0)
    float percentComplete = 0.0f;

    /**
     * open a segment over the given input file.
     *
     * @param useRawMode        when true, records are read as raw serialized
     *                          bytes rather than deserialized objects
     * @param optionalGenerator optimized key generator (may be null); when
     *                          set, an OptimizedKey is decoded from each raw
     *                          key/value pair in next()
     * @throws IOException if the file cannot be opened or stat'ed
     */
    public MergeResultSegment(FileSystem fileSystem, Configuration conf,
        Path inputFile, Class<KeyType> keyClass, Class<ValueType> valueClass,
        boolean useRawMode,
        OptimizedKeyGeneratorAndComparator<KeyType, ValueType> optionalGenerator)
    throws IOException {
      try {
        this.useRawMode = useRawMode;
        this._optimizedGenerator = optionalGenerator;
        if (_optimizedGenerator != null) {
          _optimizedKey = new OptimizedKey(_optimizedGenerator
              .getGeneratedKeyType());
        }
        // cache (and force accessibility of) the no-arg constructors so
        // typed-mode next() can instantiate keys/values cheaply
        this.keyConstructor = keyClass.getDeclaredConstructor(emptyArray);
        this.keyConstructor.setAccessible(true);
        this.valConstructor = valueClass.getDeclaredConstructor(emptyArray);
        this.valConstructor.setAccessible(true);
        if (useRawMode) {
          rawKeyData = new DataOutputBuffer();
          rawValueData = new DataOutputBuffer();
        }
      } catch (SecurityException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      } catch (NoSuchMethodException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
      this.path = inputFile;
      this.lastPos = 0;
      // total file size is the denominator for percentComplete
      this.fileSize =fileSystem.getFileStatus(inputFile).getLen();
     
      reader = new SequenceFile.Reader(fileSystem, inputFile, conf);
      if (useRawMode) {
        valueBytes = reader.createValueBytes();
      }
      index = -1;
    }

    /** construct an empty (null) segment that is already at end-of-stream */
    public MergeResultSegment() {
      eos = true;
      percentComplete = 1.0f;
    }

    void setIndex(int index) {
      this.index = index;
    }

    int getIndex() {
      return this.index;
    }

    /** true for the reader-less null segment */
    public boolean isNullSegment() {
      return reader == null;
    }

    public OptimizedKey getOptimizedKey() {
      return _optimizedKey;
    }

    /** current deserialized key - only valid in typed (non-raw) mode */
    public KeyType getKey() throws IOException {
      if (useRawMode) {
        throw new IOException("getKey Unsupported in RawMode");
      }
      return key;
    }

    /** current deserialized value - only valid in typed (non-raw) mode */
    public ValueType getValue() throws IOException {
      if (useRawMode) {
        throw new IOException("getValue Unsupported in RawMode");
      }
      return value;
    }

    public DataOutputBuffer getRawKeyData() {
      return rawKeyData;
    }

    public DataOutputBuffer getRawValueData() {
      return rawValueData;
    }

    public float getPercentComplete() {
      return percentComplete;
    }
   
    /**
     * advance to the next record.
     *
     * In typed mode a fresh key/value instance is created per record, so
     * callers may retain references to previously returned objects (the
     * combiner path in mergeAndSpill relies on this). In raw mode the byte
     * buffers are reset and refilled, and the optimized key (when configured)
     * is re-decoded from them.
     *
     * @return true if a record was read, false at end-of-stream
     */
    public boolean next() throws IOException {
      if (!eos) {
        try {
          if (!useRawMode) {
            key = keyConstructor.newInstance();
            value = valConstructor.newInstance();
          } else {
            rawKeyData.reset();
            rawValueData.reset();
          }
        } catch (Exception e) {
          LOG.error("Failed to create key or value type with Exception:"
              + CCStringUtils.stringifyException(e));
          throw new RuntimeException(e);
        }
        if (!useRawMode) {
          eos = !reader.next(key, value);
        } else {
          eos = (reader.nextRawKey(this.rawKeyData) == -1);
          if (!eos) {
            // NOTE(review): when nextRawValue returns 0 the value buffer is
            // left empty rather than treated as an error - confirm that
            // zero-length raw values are expected here
            if (reader.nextRawValue(valueBytes) != 0) {
              valueBytes.writeUncompressedBytes(rawValueData);
            }

            if (!eos && _optimizedGenerator != null) {
              _optimizedKey.initFromKeyValuePair(rawKeyData.getData(), 0,
                  rawKeyData.getLength(), rawValueData.getData(), 0,
                  rawValueData.getLength());
            }
          }
        }
        // refresh read-progress bookkeeping
        if (!eos) {
          if (lastPos != reader.getPosition()) {
            percentComplete = (float)((double)reader.getPosition()/(double)fileSize);
            lastPos = reader.getPosition();
          }
        }
        else {
          percentComplete = 1.0f;
        }
      }
      return !eos;
    }

    public void close() throws IOException {
      if (reader != null)
        reader.close();
    }

    public Path getPath() {
      return path;
    }

    /** human-readable name used in log messages */
    public String getName() {
      return "Seg:" + index + "(" + path.toString() + ")";
    }

  }
}
// (web-scrape footer removed - "TOP / Related Classes / Copyright ...
// www.massapi.com" text was not part of the original source file)