package org.commoncrawl.hadoop.mergeutils;
/*
* Copyright 2010 - CommonCrawl Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey;
import org.commoncrawl.util.shared.CCStringUtils;
import org.commoncrawl.util.shared.IntrusiveList;
import org.commoncrawl.util.shared.IntrusiveList.IntrusiveListElement;
/**
* merge sort a pre-sorted set of sequence files and spill them to output
*
*
*
* @author rana
*
* @param <KeyType>
* @param <ValueType>
*/
public class SequenceFileMerger<KeyType extends WritableComparable, ValueType extends Writable> {
/** Hadoop counters reported through the optional {@link Reporter} during a merge. */
public enum Counters {
  // total number of records emitted to the spill writer
  RECORDS_MERGED,
  // merge completion percentage, reported as incremental deltas (0-100)
  PCT_COMPLETED
}

public static final Log LOG = LogFactory.getLog(SequenceFileMerger.class);

// the set of input files (segments) to operate on, kept ordered by each
// segment's current head key
IntrusiveList<MergeResultSegment<KeyType, ValueType>> _segmentList = new IntrusiveList<MergeResultSegment<KeyType, ValueType>>();
// the initial segment count (field name retains the historical "Segement"
// spelling; renaming would touch every use site)
int _originalSegementCount = 0;
// number of segments fully consumed (closed) so far
int _completedSegmentCount = 0;
// last percent-complete value reported via the PCT_COMPLETED counter
long _percentComplete = 0L;
// the output spill writer
SpillWriter<KeyType, ValueType> _writer = null;
// a reference to the raw writer interface if _writer implements
// RawDataSpillWriter
RawDataSpillWriter<KeyType, ValueType> _rawWriter = null;
// basic key value comparator used to merge files
KeyValuePairComparator<KeyType, ValueType> _comparator;
// raw (byte-level) comparator, set when _comparator also implements
// RawKeyValueComparator
RawKeyValueComparator<KeyType, ValueType> _rawComparator = null;
// optimized key generator interface (mutually exclusive with _rawComparator)
OptimizedKeyGeneratorAndComparator<KeyType, ValueType> _optimizedKeyGenerator = null;
// optional combiner interface, used to collapse runs of equal keys
SpillValueCombiner<KeyType, ValueType> _optionalCombiner = null;
// number of records read from the input segments
long _inputRecordCount = 0;
// number of records emitted to the spill writer
long _mergedRecordCount = 0;
// bit flags describing the optimized key layout (see OptimizedKey.KEY_TYPE_*)
int _optimizedKeyType = 0;
/**
 * construct a basic merger using a standard basic or raw comparator
 *
 * @param fileSystem filesystem containing the input segments
 * @param conf hadoop configuration used to open the sequence file readers
 * @param inputSegments paths of the pre-sorted input sequence files
 * @param spillWriter destination writer for the merged output
 * @param keyClass key type class (must expose a no-arg constructor; it is
 *          looked up reflectively and made accessible)
 * @param valueClass value type class (must expose a no-arg constructor)
 * @param optionalCombiner optional combiner used to collapse consecutive
 *          values sharing the same key (may be null)
 * @param comparator key/value comparator; if it also implements
 *          RawKeyValueComparator the merge runs in raw (byte-level) mode
 * @throws IOException on invalid configuration or segment open failure
 */
public SequenceFileMerger(FileSystem fileSystem, Configuration conf,
    Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
    Class<KeyType> keyClass, Class<ValueType> valueClass,
    SpillValueCombiner<KeyType, ValueType> optionalCombiner,
    KeyValuePairComparator<KeyType, ValueType> comparator) throws IOException {
  // common init ...
  init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
      comparator, null, optionalCombiner);
}
/**
 * construct a specialized merger that uses an optimized key generator to
 * speed merges (used by merge sort spill writer)
 *
 * this constructor is package private since it requires a special contract
 * between mergesortspillwriter and sequencefilemerger
 *
 * @param fileSystem filesystem containing the input segments
 * @param conf hadoop configuration used to open the sequence file readers
 * @param inputSegments paths of the pre-sorted input sequence files
 * @param spillWriter destination writer for the merged output
 * @param keyClass key type class (must expose a no-arg constructor)
 * @param valueClass value type class (must expose a no-arg constructor)
 * @param keyGenerator generator whose derived keys (long and/or buffer)
 *          are compared instead of deserializing the real keys
 * @throws IOException on invalid configuration or segment open failure
 */
SequenceFileMerger(FileSystem fileSystem, Configuration conf,
    Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
    Class<KeyType> keyClass, Class<ValueType> valueClass,
    OptimizedKeyGeneratorAndComparator<KeyType, ValueType> keyGenerator)
    throws IOException {
  // capture the key layout flags before common init so the sort comparator
  // (set up inside init) can consult them
  _optimizedKeyType = keyGenerator.getGeneratedKeyType();
  // common init ...
  init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
      null, keyGenerator, null);
}
/**
 * construct a merger that uses a raw (byte-level) comparator
 *
 * NOTE(review): unlike the optimized-key constructor this one is declared
 * public, not package private - the previous comment claiming otherwise
 * was inaccurate.
 *
 * @param fileSystem filesystem containing the input segments
 * @param conf hadoop configuration used to open the sequence file readers
 * @param inputSegments paths of the pre-sorted input sequence files
 * @param spillWriter destination writer for the merged output; must
 *          implement RawDataSpillWriter for raw-mode merges
 * @param keyClass key type class (must expose a no-arg constructor)
 * @param valueClass value type class (must expose a no-arg constructor)
 * @param comparator raw key/value comparator operating on serialized bytes
 * @throws IOException on invalid configuration or segment open failure
 */
public SequenceFileMerger(FileSystem fileSystem, Configuration conf,
    Vector<Path> inputSegments, SpillWriter<KeyType, ValueType> spillWriter,
    Class<KeyType> keyClass, Class<ValueType> valueClass,
    RawKeyValueComparator<KeyType, ValueType> comparator) throws IOException {
  // common init ...
  init(fileSystem, conf, inputSegments, spillWriter, keyClass, valueClass,
      comparator, null, null);
}
/**
 * close the merger, releasing every input segment that is still open.
 * Failures while closing an individual segment are logged and do not
 * prevent the remaining segments from being closed.
 *
 * @throws IOException
 */
public void close() throws IOException {
  java.util.Iterator<MergeResultSegment<KeyType, ValueType>> segmentIter = _segmentList
      .iterator();
  while (segmentIter.hasNext()) {
    MergeResultSegment<KeyType, ValueType> nextSegment = segmentIter.next();
    try {
      nextSegment.close();
    } catch (IOException closeError) {
      // best-effort: log and continue closing the rest
      LOG.error(CCStringUtils.stringifyException(closeError));
    }
  }
}
/**
 * merge the pre-sorted input segments and spill the merged stream to the
 * configured writer.
 *
 * Algorithm: the segment list is kept ordered by each segment's current
 * head key. Each pass removes the head segment, spills (or, in combiner
 * mode, accumulates) its current record, advances the segment, and
 * re-inserts it at the correct position found via binary search
 * (_findInsertionPos). Exhausted segments are closed and counted toward
 * progress.
 *
 * NOTE(review): a per-record IOException is logged and the merge continues,
 * but the failing segment is not re-inserted - its remaining records are
 * silently dropped from the output.
 *
 * @param reporter optional hadoop reporter for counters/progress (may be null)
 * @throws IOException on unrecoverable spill failures
 */
@SuppressWarnings("unchecked")
public void mergeAndSpill(final Reporter reporter) throws IOException {
  long sortStartTime = System.currentTimeMillis();
  // allocate our sort array - one extra slot because the head segment is
  // removed from the list before the remainder is copied in
  MergeResultSegment<KeyType, ValueType> sortArray[] = new MergeResultSegment[_segmentList
      .size() + 1];
  // key owning the values currently buffered for the combiner
  KeyType lastCombinerKey = null;
  Vector<ValueType> valueBuffer = new Vector<ValueType>();
  while (_segmentList.getHead() != null) {
    MergeResultSegment<KeyType, ValueType> spillSegment = null;
    try {
      // get the head element (segment with the smallest current key)
      spillSegment = _segmentList.removeHead();
      // if no combiner spill directly ...
      if (_optionalCombiner == null) {
        _mergedRecordCount++;
        // ok in the case of optimized keys ...
        if (_optimizedKeyGenerator != null) {
          // segments with optimized keys have {optimized key header} +
          // {original-key-len} (4 bytes) preceding the actual key bytes,
          // and optional generator buffer data at the tail end of the
          // value - spill only the original key/value payload
          _rawWriter.spillRawRecord(spillSegment.getRawKeyData().getData(),
              spillSegment.getOptimizedKey().getHeaderSize() + 4,
              spillSegment.getRawKeyData().getLength()
                  - spillSegment.getOptimizedKey().getHeaderSize() - 4,
              spillSegment.getRawValueData().getData(), 0, spillSegment
                  .getRawValueData().getLength()
                  - spillSegment.getOptimizedKey().getDataBufferSize());
        } else if (_rawComparator != null) {
          // raw mode - pass the serialized key/value bytes straight through
          _rawWriter.spillRawRecord(spillSegment.getRawKeyData().getData(),
              0, spillSegment.getRawKeyData().getLength(), spillSegment
                  .getRawValueData().getData(), 0, spillSegment
                  .getRawValueData().getLength());
        } else {
          // typed mode - spill the deserialized key/value objects
          _writer.spillRecord(spillSegment.getKey(), spillSegment.getValue());
        }
      } else {
        // combiner mode: buffer values while the key is unchanged; when the
        // key changes, combine and flush the buffered run
        if (valueBuffer.size() != 0
            && lastCombinerKey.compareTo(spillSegment.getKey()) != 0) {
          // combine and flush last set of values ...
          _mergedRecordCount++;
          _writer.spillRecord(lastCombinerKey, _optionalCombiner
              .combineValues(lastCombinerKey, valueBuffer));
          // clear accumulation buffer
          valueBuffer.clear();
        }
        if (valueBuffer.size() == 0) {
          // set current key as lastKey
          lastCombinerKey = spillSegment.getKey();
        }
        // add value to buffer
        valueBuffer.add(spillSegment.getValue());
      }
      // and see if there is a next item for the spilled segment
      if (spillSegment.next()) {
        _inputRecordCount++;
        // yes, ok insert it back into the list at the appropriate position
        // ...
        if (_segmentList.size() == 0) {
          _segmentList.addHead(spillSegment);
        } else {
          // first convert existing list to array
          addItemsToArray(sortArray, _segmentList);
          // next find insertion position
          MergeResultSegment<KeyType, ValueType> insertionPos = _findInsertionPos(
              spillSegment, sortArray, _segmentList.size());
          // if null, add to head ...
          if (insertionPos == null) {
            _segmentList.addHead(spillSegment);
          } else {
            _segmentList.insertAfter(insertionPos, spillSegment);
          }
        }
      }
      // otherwise ...
      else {
        // segment exhausted - close it and count it toward progress
        try {
          spillSegment.close();
        } catch (IOException e) {
          LOG.error("Segment:" + spillSegment.getName() + " Exception:"
              + CCStringUtils.stringifyException(e));
        }
        finally {
          _completedSegmentCount++;
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      if (spillSegment != null) {
        // (fixed typo in log message: "splill" -> "spill")
        LOG.error("Error during spill of segment:" + spillSegment.getName()
            + " Exception:" + CCStringUtils.stringifyException(e));
      }
    }
    if (_mergedRecordCount % 100000 == 0) {
      updateProgress(reporter);
      LOG.info("Merged " + _mergedRecordCount + " Items");
    }
  }
  updateProgress(reporter);
  // now, if combiner is not null and there is a value buffered up ..
  if (_optionalCombiner != null && valueBuffer.size() != 0) {
    _mergedRecordCount++;
    // combine and flush last set of values ...
    _writer.spillRecord(lastCombinerKey, _optionalCombiner.combineValues(
        lastCombinerKey, valueBuffer));
    // clear combiner buffer ..
    valueBuffer.clear();
  }
  LOG.info("Merge took:" + (System.currentTimeMillis() - sortStartTime)
      + " InputRecordCount:" + _inputRecordCount + " MergedRecordCount:"
      + _mergedRecordCount);
}
// merged-record count already reported to the counter framework; lets us
// report deltas, since Reporter.incrCounter accumulates
private long _lastReportedRecordCount = 0;

/**
 * report merge progress through the (optional) reporter: advances the
 * RECORDS_MERGED counter by the records merged since the last report, and
 * advances PCT_COMPLETED based on completed segments plus the read
 * position of each still-open segment.
 *
 * @param reporter optional hadoop reporter (may be null, in which case
 *          this is a no-op)
 */
void updateProgress(Reporter reporter) {
  if (reporter != null) {
    // incrCounter adds its argument to the counter, so passing the running
    // total on every call (as the original code did) over-counts; report
    // only the delta since the last call
    long recordDelta = _mergedRecordCount - _lastReportedRecordCount;
    if (recordDelta > 0) {
      reporter.incrCounter(Counters.RECORDS_MERGED, recordDelta);
      _lastReportedRecordCount = _mergedRecordCount;
    }
    // guard against divide-by-zero when there were no input segments
    if (_originalSegementCount != 0) {
      // ok fraction attributed to each segment
      float segmentFraction = 1.0f / _originalSegementCount;
      // add in completed segments
      float pctComplete = segmentFraction * _completedSegmentCount;
      // add in partial segments ...
      for (MergeResultSegment segment : _segmentList) {
        // calculated partial completion score
        pctComplete += segmentFraction * segment.getPercentComplete();
      }
      long pctCompleteAsLong = (long) (pctComplete * 100L);
      long delta = pctCompleteAsLong - _percentComplete;
      if (delta > 0) {
        reporter.incrCounter(Counters.PCT_COMPLETED, delta);
        _percentComplete = pctCompleteAsLong;
      }
    }
  }
}
/**
 * internal init method: validates the comparator/generator/combiner
 * configuration, opens each input segment, primes it with its first
 * record (dropping empty segments), sorts the segments by their head key,
 * and wires up the spill writer.
 *
 * @param fileSystem filesystem containing the input segments
 * @param conf hadoop configuration used to open the readers
 * @param inputSegments paths of the pre-sorted input sequence files
 * @param spillWriter destination writer for the merged output
 * @param keyClass key type class (must expose a no-arg constructor)
 * @param valueClass value type class (must expose a no-arg constructor)
 * @param comparator typed comparator; may also implement
 *          RawKeyValueComparator to enable raw mode (null when a generator
 *          is supplied)
 * @param optionalKeyGenerator optimized key generator - mutually exclusive
 *          with a raw comparator
 * @param optionalCombiner optional value combiner (typed mode only)
 * @throws IOException on invalid configuration or segment open failure
 */
@SuppressWarnings("unchecked")
private void init(
    FileSystem fileSystem,
    Configuration conf,
    Vector<Path> inputSegments,
    SpillWriter<KeyType, ValueType> spillWriter,
    Class<KeyType> keyClass,
    Class<ValueType> valueClass,
    KeyValuePairComparator<KeyType, ValueType> comparator,
    OptimizedKeyGeneratorAndComparator<KeyType, ValueType> optionalKeyGenerator,
    SpillValueCombiner<KeyType, ValueType> optionalCombiner
    ) throws IOException {
  _comparator = comparator;
  _optimizedKeyGenerator = optionalKeyGenerator;
  if (_comparator instanceof RawKeyValueComparator) {
    _rawComparator = (RawKeyValueComparator<KeyType, ValueType>) _comparator;
  }
  if (_rawComparator != null && _optimizedKeyGenerator != null) {
    throw new IOException(
        "RawComparator not compatible with OptimizedKeyGenerator option!");
  }
  _optionalCombiner = optionalCombiner;
  // fail fast: raw mode never deserializes key/value objects, but the
  // combiner path in mergeAndSpill calls getKey()/getValue(), which throw
  // in raw mode - the combination can never work
  if (_optionalCombiner != null && _rawComparator != null) {
    throw new IOException(
        "Combiner not compatible with RawComparator option!");
  }
  try {
    Vector<MergeResultSegment<KeyType, ValueType>> segments = new Vector<MergeResultSegment<KeyType, ValueType>>();
    for (Path path : inputSegments) {
      MergeResultSegment<KeyType, ValueType> resultSegment = new MergeResultSegment<KeyType, ValueType>(
          fileSystem, conf, path, keyClass, valueClass,
          _rawComparator != null || _optimizedKeyGenerator != null,
          _optimizedKeyGenerator);
      if (!resultSegment.next()) {
        // segment returned EOS on its initial next() - it is empty, so
        // close and ignore it
        try {
          resultSegment.close();
        } catch (IOException e) {
          LOG.error("QueryResultSegment:" + path + " Threw Exception:"
              + CCStringUtils.stringifyException(e));
        }
      } else {
        _inputRecordCount++;
        segments.add(resultSegment);
      }
    }
    // create temporary array for sorting purposes ...
    MergeResultSegment<KeyType, ValueType> segmentArray[] = segments
        .toArray(new MergeResultSegment[0]);
    // sort the segments by their current (first) key ...
    Arrays.sort(segmentArray,
        new Comparator<MergeResultSegment<KeyType, ValueType>>() {
          @Override
          public int compare(MergeResultSegment<KeyType, ValueType> o1,
              MergeResultSegment<KeyType, ValueType> o2) {
            try {
              if (_optimizedKeyGenerator != null) {
                int result = 0;
                if ((_optimizedKeyType & OptimizedKey.KEY_TYPE_LONG) != 0) {
                  // overflow-safe three-way compare; the original cast of
                  // the long difference to int could wrap and mis-order
                  // keys whose values differ by more than Integer.MAX_VALUE
                  long lhs = o1.getOptimizedKey().getLongKeyValue();
                  long rhs = o2.getOptimizedKey().getLongKeyValue();
                  result = (lhs < rhs) ? -1 : ((lhs == rhs) ? 0 : 1);
                }
                if (result == 0
                    && ((_optimizedKeyType & OptimizedKey.KEY_TYPE_BUFFER) != 0)) {
                  // compare buffers ...
                  result = _optimizedKeyGenerator.compareOptimizedBufferKeys(
                      o1.getOptimizedKey().getBufferKeyValue().get(), o1
                          .getOptimizedKey().getBufferKeyValue().getOffset(),
                      o1.getOptimizedKey().getBufferKeyValue().getCount(), o2
                          .getOptimizedKey().getBufferKeyValue().get(), o2
                          .getOptimizedKey().getBufferKeyValue().getOffset(),
                      o2.getOptimizedKey().getBufferKeyValue().getCount());
                }
                return result;
              } else if (_rawComparator != null) {
                return _rawComparator.compareRaw(
                    o1.getRawKeyData().getData(), 0, o1.getRawKeyData()
                        .getLength(), o2.getRawKeyData().getData(), 0, o2
                        .getRawKeyData().getLength(), o1.getRawValueData()
                        .getData(), 0, o1.getRawValueData().getLength(), o2
                        .getRawValueData().getData(), 0, o2.getRawValueData()
                        .getLength());
              } else {
                return _comparator.compare(o1.getKey(), o1.getValue(), o2
                    .getKey(), o2.getValue());
              }
            } catch (IOException e) {
              // Comparator cannot throw checked exceptions - wrap and
              // propagate (cause preserved)
              LOG.error(CCStringUtils.stringifyException(e));
              throw new RuntimeException(e);
            }
          }
        });
    // now store the segments in sorted order ...
    int index = 0;
    for (MergeResultSegment<KeyType, ValueType> segment : segmentArray) {
      segment.setIndex(index++);
      _segmentList.addTail(segment);
    }
    _originalSegementCount = segmentArray.length;
    _writer = spillWriter;
    // a raw spill writer is only required in raw mode (raw comparator or
    // optimized keys); the typed path only ever calls spillRecord(), so a
    // plain SpillWriter is acceptable there. The original code threw
    // unconditionally, rejecting valid typed-mode writers.
    if (_writer instanceof RawDataSpillWriter) {
      _rawWriter = (RawDataSpillWriter<KeyType, ValueType>) _writer;
    } else if (_rawComparator != null || _optimizedKeyGenerator != null) {
      throw new IOException(
          "Writer supplied with RawComparator does not implement RawDataSpillWriter");
    }
  } catch (IOException e) {
    // clean up any segments already opened before propagating
    LOG.error(CCStringUtils.stringifyException(e));
    for (MergeResultSegment<KeyType, ValueType> segment : _segmentList) {
      try {
        segment.close();
      } catch (IOException e2) {
        LOG.error(CCStringUtils.stringifyException(e2));
      }
    }
    throw e;
  }
}
/**
 * copy the current members of the given intrusive list into the supplied
 * array starting at slot zero, preserving list order
 *
 * @param array destination array (must hold at least list.size() entries)
 * @param list source list - not modified
 */
private final void addItemsToArray(
    MergeResultSegment<KeyType, ValueType>[] array,
    IntrusiveList<MergeResultSegment<KeyType, ValueType>> list) {
  int slot = 0;
  for (MergeResultSegment<KeyType, ValueType> element = list.getHead(); element != null; element = element
      .getNext()) {
    array[slot++] = element;
  }
}
/**
 * binary search the sorted segment array for the position after which the
 * given segment should be re-inserted to keep the list ordered by current
 * key.
 *
 * @param searchSegment segment whose current key is being placed
 * @param array segments in sorted order (only the first arrayCount entries
 *          are valid)
 * @param arrayCount number of valid entries in array
 * @return the segment to insert after, or null to insert at the list head
 * @throws IOException if a comparator fails
 */
private final MergeResultSegment<KeyType, ValueType> _findInsertionPos(
    MergeResultSegment<KeyType, ValueType> searchSegment,
    MergeResultSegment<KeyType, ValueType>[] array, int arrayCount)
    throws IOException {
  int low = 0;
  int high = arrayCount - 1;
  while (low <= high) {
    int mid = low + ((high - low) / 2);
    MergeResultSegment<KeyType, ValueType> segment = array[mid];
    int compareResult = 0;
    if (_optimizedKeyGenerator != null) {
      if ((_optimizedKeyType & OptimizedKey.KEY_TYPE_LONG) != 0) {
        // overflow-safe three-way compare; the original cast of the long
        // difference to int could wrap and mis-order keys whose values
        // differ by more than Integer.MAX_VALUE
        long lhs = segment.getOptimizedKey().getLongKeyValue();
        long rhs = searchSegment.getOptimizedKey().getLongKeyValue();
        compareResult = (lhs < rhs) ? -1 : ((lhs == rhs) ? 0 : 1);
      }
      if (compareResult == 0
          && (_optimizedKeyType & OptimizedKey.KEY_TYPE_BUFFER) != 0) {
        // compare buffers ...
        compareResult = _optimizedKeyGenerator.compareOptimizedBufferKeys(
            segment.getOptimizedKey().getBufferKeyValue().get(), segment
                .getOptimizedKey().getBufferKeyValue().getOffset(), segment
                .getOptimizedKey().getBufferKeyValue().getCount(),
            searchSegment.getOptimizedKey().getBufferKeyValue().get(),
            searchSegment.getOptimizedKey().getBufferKeyValue().getOffset(),
            searchSegment.getOptimizedKey().getBufferKeyValue().getCount());
      }
    } else if (_rawComparator != null) {
      compareResult = _rawComparator.compareRaw(segment.getRawKeyData()
          .getData(), 0, segment.getRawKeyData().getLength(), searchSegment
          .getRawKeyData().getData(), 0, searchSegment.getRawKeyData()
          .getLength(), segment.getRawValueData().getData(), 0, segment
          .getRawValueData().getLength(), searchSegment.getRawValueData()
          .getData(), 0, searchSegment.getRawValueData().getLength());
    } else {
      compareResult = _comparator.compare(segment.getKey(), segment
          .getValue(), searchSegment.getKey(), searchSegment.getValue());
    }
    if (compareResult > 0) {
      high = mid - 1;
    } else if (compareResult < 0) {
      low = mid + 1;
    } else {
      return array[mid]; // exact match - insert directly after it
    }
  }
  // not found ... return best insertion position (null means: new smallest
  // key, caller inserts at head)
  if (high == -1) {
    return null;
  } else {
    return array[high];
  }
}
/**
 * wraps a single pre-sorted input sequence file and exposes its records
 * one at a time to the merge. Operates in one of two modes: typed mode
 * (key/value deserialized into Writable objects) or raw mode (undecoded
 * key/value bytes, optionally with an OptimizedKey parsed out of each
 * record when a generator is supplied).
 */
private static class MergeResultSegment<KeyType extends Writable, ValueType extends Writable>
    extends IntrusiveListElement<MergeResultSegment<KeyType, ValueType>> {
  // cached empty signature used to look up the no-arg constructors
  private static final Class[] emptyArray = new Class[] {};
  // underlying sequence file reader (null for the "null segment")
  SequenceFile.Reader reader = null;
  // current deserialized key/value (typed mode only)
  KeyType key = null;
  ValueType value = null;
  // reflected no-arg constructors used to materialize fresh key/value
  // instances on every next() call
  Constructor<KeyType> keyConstructor = null;
  Constructor<ValueType> valConstructor = null;
  // true once the reader has been exhausted
  boolean eos = false;
  // path of the backing file
  Path path;
  // last observed reader position - used to avoid recomputing progress
  long lastPos;
  // total file size in bytes, for percent-complete calculation
  long fileSize;
  // position assigned by the merger after the initial sort
  int index = -1;
  // true when reading raw key/value bytes instead of typed objects
  boolean useRawMode = false;
  // raw-mode buffers holding the current record's serialized key/value
  DataOutputBuffer rawKeyData = null;
  DataOutputBuffer rawValueData = null;
  // scratch holder for raw value bytes from the reader
  ValueBytes valueBytes = null;
  // optional generator + parsed optimized key (raw mode only)
  OptimizedKeyGeneratorAndComparator<KeyType, ValueType> _optimizedGenerator = null;
  OptimizedKey _optimizedKey = null;
  // fraction of the file consumed so far [0.0, 1.0]
  float percentComplete = 0.0f;

  /**
   * open a segment over the given sequence file.
   *
   * @param fileSystem filesystem containing the input file
   * @param conf hadoop configuration for the reader
   * @param inputFile path of the pre-sorted sequence file
   * @param keyClass key type class (must expose a no-arg constructor)
   * @param valueClass value type class (must expose a no-arg constructor)
   * @param useRawMode when true, records are read as raw bytes
   * @param optionalGenerator optimized key generator (may be null)
   * @throws IOException on filesystem/reader failure
   */
  public MergeResultSegment(FileSystem fileSystem, Configuration conf,
      Path inputFile, Class<KeyType> keyClass, Class<ValueType> valueClass,
      boolean useRawMode,
      OptimizedKeyGeneratorAndComparator<KeyType, ValueType> optionalGenerator)
      throws IOException {
    try {
      this.useRawMode = useRawMode;
      this._optimizedGenerator = optionalGenerator;
      if (_optimizedGenerator != null) {
        _optimizedKey = new OptimizedKey(_optimizedGenerator
            .getGeneratedKeyType());
      }
      // reflect (and force-open) the no-arg constructors up front so a
      // missing constructor fails at segment creation, not mid-merge
      this.keyConstructor = keyClass.getDeclaredConstructor(emptyArray);
      this.keyConstructor.setAccessible(true);
      this.valConstructor = valueClass.getDeclaredConstructor(emptyArray);
      this.valConstructor.setAccessible(true);
      if (useRawMode) {
        rawKeyData = new DataOutputBuffer();
        rawValueData = new DataOutputBuffer();
      }
    } catch (SecurityException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      throw new RuntimeException(e);
    } catch (NoSuchMethodException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
    this.path = inputFile;
    this.lastPos = 0;
    this.fileSize =fileSystem.getFileStatus(inputFile).getLen();
    reader = new SequenceFile.Reader(fileSystem, inputFile, conf);
    if (useRawMode) {
      valueBytes = reader.createValueBytes();
    }
    index = -1;
  }

  /** construct an empty "null" segment that reports immediate EOS. */
  public MergeResultSegment() {
    eos = true;
    percentComplete = 1.0f;
  }

  // set the segment's position assigned after the initial sort
  void setIndex(int index) {
    this.index = index;
  }

  int getIndex() {
    return this.index;
  }

  /** @return true when this is the reader-less "null segment" */
  public boolean isNullSegment() {
    return reader == null;
  }

  /** @return the optimized key parsed from the current record (raw mode + generator only) */
  public OptimizedKey getOptimizedKey() {
    return _optimizedKey;
  }

  /**
   * @return the current deserialized key
   * @throws IOException always, when the segment is in raw mode
   */
  public KeyType getKey() throws IOException {
    if (useRawMode) {
      throw new IOException("getKey Unsupported in RawMode");
    }
    return key;
  }

  /**
   * @return the current deserialized value
   * @throws IOException always, when the segment is in raw mode
   */
  public ValueType getValue() throws IOException {
    if (useRawMode) {
      throw new IOException("getValue Unsupported in RawMode");
    }
    return value;
  }

  // raw serialized key bytes for the current record (raw mode only)
  public DataOutputBuffer getRawKeyData() {
    return rawKeyData;
  }

  // raw serialized value bytes for the current record (raw mode only)
  public DataOutputBuffer getRawValueData() {
    return rawValueData;
  }

  /** @return fraction of the backing file consumed so far [0.0, 1.0] */
  public float getPercentComplete() {
    return percentComplete;
  }

  /**
   * advance to the next record.
   *
   * @return true if a record is available, false at end-of-stream
   * @throws IOException on reader failure
   */
  public boolean next() throws IOException {
    if (!eos) {
      try {
        if (!useRawMode) {
          // fresh instances each time so previously-returned objects are
          // not clobbered
          key = keyConstructor.newInstance();
          value = valConstructor.newInstance();
        } else {
          rawKeyData.reset();
          rawValueData.reset();
        }
      } catch (Exception e) {
        LOG.error("Failed to create key or value type with Exception:"
            + CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
      if (!useRawMode) {
        eos = !reader.next(key, value);
      } else {
        // nextRawKey returns -1 at end of stream
        eos = (reader.nextRawKey(this.rawKeyData) == -1);
        if (!eos) {
          // zero-length values leave rawValueData empty by design
          if (reader.nextRawValue(valueBytes) != 0) {
            valueBytes.writeUncompressedBytes(rawValueData);
          }
          // NOTE(review): the inner !eos check is redundant here (eos is
          // known false) but harmless
          if (!eos && _optimizedGenerator != null) {
            // parse the generator's comparison key out of the raw record
            _optimizedKey.initFromKeyValuePair(rawKeyData.getData(), 0,
                rawKeyData.getLength(), rawValueData.getData(), 0,
                rawValueData.getLength());
          }
        }
      }
      if (!eos) {
        // only recompute progress when the reader actually moved
        if (lastPos != reader.getPosition()) {
          percentComplete = (float)((double)reader.getPosition()/(double)fileSize);
          lastPos = reader.getPosition();
        }
      }
      else {
        percentComplete = 1.0f;
      }
    }
    return !eos;
  }

  /** close the underlying reader (safe to call on a null segment). */
  public void close() throws IOException {
    if (reader != null)
      reader.close();
  }

  public Path getPath() {
    return path;
  }

  /** @return human-readable identifier used in log messages */
  public String getName() {
    return "Seg:" + index + "(" + path.toString() + ")";
  }
}
}