Source Code of org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter$QueuedBufferItem

package org.commoncrawl.hadoop.mergeutils;

/*
*    Copyright 2010 - CommonCrawl Foundation
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
*/

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.commoncrawl.util.shared.CCStringUtils;

/**
*
* An output sink that buffers records in memory and writes them to a Hadoop
* sequence file on a background writer thread.
*
* @author rana
*
* @param <KeyType> the record key type (a WritableComparable)
* @param <ValueType> the record value type (a Writable)
*/
public class SequenceFileSpillWriter<KeyType extends WritableComparable, ValueType extends Writable>
implements RawDataSpillWriter<KeyType, ValueType> {

  private static final Log LOG = LogFactory
      .getLog(SequenceFileSpillWriter.class);
  private static final int BUFFER_QUEUE_CAPACITY = 100;

  private SequenceFileIndexWriter<KeyType, ValueType> _indexWriter = null;
  private SequenceFile.Writer writer = null;
  private long _recordCount = 0;
  private FSDataOutputStream _outputStream;
  private ByteBuffer _activeBuffer = null;
  private Thread _writerThread = null;
  private IOException _writerException = null;
  private int _spillBufferSize = -1;

  // default buffer size
  public static final int DEFAULT_SPILL_BUFFER_SIZE = 1000000;
  // the spill buffer size (in bytes) parameter
  public static final String SPILL_WRITER_BUFFER_SIZE_PARAM = "commoncrawl.spillwriter.buffer.size";
  // the queue capacity parameter
  public static final String QUEUE_CAPACITY_PARAM = "commoncrawl.spillwriter.queue.capacity";

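  // wraps a single spill buffer handed to the background writer thread; an item
  // whose buffer is null acts as the shutdown sentinel (see close())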
  private static class QueuedBufferItem {

    QueuedBufferItem(ByteBuffer buffer) {
      _buffer = buffer;
    }

    public ByteBuffer _buffer;
  }

  private LinkedBlockingQueue<QueuedBufferItem> _bufferQueue;

  public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf,
      Path outputFilePath, Class<KeyType> keyClass,
      Class<ValueType> valueClass,
      SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter,
      CompressionCodec codec, short replicationFactor) throws IOException {

    _bufferQueue = new LinkedBlockingQueue<QueuedBufferItem>(
        conf.getInt(QUEUE_CAPACITY_PARAM, BUFFER_QUEUE_CAPACITY));
    _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM,
        DEFAULT_SPILL_BUFFER_SIZE);
    _outputStream = fileSystem.create(outputFilePath, true, 10 * 1024 * 1024,
        replicationFactor, fileSystem.getDefaultBlockSize());
    // allocate buffer ...
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
    // assign index writer ..
    _indexWriter = optionalIndexWriter;
   
    if (codec != null) {
      writer = SequenceFile.createWriter(conf, _outputStream, keyClass,
          valueClass, CompressionType.BLOCK, codec);
    } else {
      writer = SequenceFile.createWriter(conf, _outputStream, keyClass,
          valueClass, CompressionType.NONE, null);
    }

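    // background writer thread: drains spill buffers from _bufferQueue and
    // writes each framed record out to the sequence file via spillRawRecord2()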
    _writerThread = new Thread(new Runnable() {

      @Override
      public void run() {
        // LOG.info("Writer Thread Starting");

        while (true) {

          QueuedBufferItem queuedBufferItem = null;

          try {
            queuedBufferItem = _bufferQueue.take();
          } catch (InterruptedException e) {
            // unexpected interrupt: log it and wait for the next buffer rather
            // than falling through with a null item (which would NPE below)
            LOG.error(CCStringUtils.stringifyException(e));
            continue;
          }
          if (queuedBufferItem._buffer == null) {
            // LOG.info("Writer Thread received empty buffer item. Exiting");
            return;
          } else {

            ByteBuffer theBuffer = queuedBufferItem._buffer;

            // LOG.info("Writer Thread received item. Limit:" +
            // theBuffer.limit());

            // get byte pointer
            byte[] bufferAsBytes = theBuffer.array();

            int itemsWritten = 0;
            long timeStart = System.currentTimeMillis();

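            // each record in the buffer is framed as:
            // [int keyLen][key bytes][int valueLen][value bytes]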
            while (theBuffer.remaining() != 0) {

              // now read in key length
              int keyLen = theBuffer.getInt();
              // mark key position
              int keyPos = theBuffer.position();
              // now skip past key length
              theBuffer.position(keyPos + keyLen);
              // read value length
              int valueLen = theBuffer.getInt();
              // mark value position
              int valuePosition = theBuffer.position();
              // now skip past it ...
              theBuffer.position(valuePosition + valueLen);
              // now write this out to the sequence file ...

              try {
                spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes,
                    valuePosition, valueLen);
              } catch (IOException e) {
                LOG.error("Writer Thread Failed with Error:"
                    + CCStringUtils.stringifyException(e));
                _writerException = e;
                return;
              }
              itemsWritten++;
            }
            // LOG.info("Writer Thread Finished With Buffer. Wrote:"+
            // itemsWritten + " in:" + (System.currentTimeMillis() -
            // timeStart));
          }
        }
      }

    });
    _writerThread.start();
  }

  @Override
  public void close() throws IOException {

    // flush any active buffers ...
    flushActiveBuffer();

    // wait for writer thread to terminate gracefully
    if (_writerThread != null) {
      try {
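        // queue a sentinel item (null buffer) so the writer thread exits its loop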
        _bufferQueue.put(new QueuedBufferItem(null));
        // LOG.info("Waiting for Writer Thread to Die");
        _writerThread.join();
        // LOG.info("Writer Thread Died. Continuing Close");
        _writerThread = null;
      } catch (InterruptedException e) {
        // interrupted while waiting for the writer thread to finish; restore
        // the interrupt flag and continue closing resources
        Thread.currentThread().interrupt();
      }
    }

    if (writer != null) {
      // close sequence file writer
      writer.close();
      writer = null;
    }

    // and index writer (if open);
    if (_indexWriter != null) {
      _indexWriter.close();
      _indexWriter = null;
    }
    // and finally output stream
    if (_outputStream != null) {
      _outputStream.close();
      _outputStream = null;
    }

    // LOG.info("Closing Spill File. RecordCount:" + _recordCount);
  }

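  // wraps a ByteBuffer as an OutputStream so Writable instances can serialize
  // directly into the active spill buffer; writes past the buffer's limit
  // surface as BufferOverflowException, which the spill methods trap below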
  private static OutputStream newOutputStream(final ByteBuffer buf) {
    return new OutputStream() {

      @Override
      public void write(int b) throws IOException {
        buf.put((byte) (b & 0xff));
      }

      @Override
      public void write(byte[] src, int off, int len) throws IOException {
        buf.put(src, off, len);
      }
    };
  }

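  // serializes key and value into the active spill buffer, framing each as
  // [int length][bytes]; the 4-byte length prefixes are back-patched once the
  // Writables have written themselves. On overflow the partial write is rolled
  // back, the buffer is flushed to the writer thread, and the record is retried
  // against a fresh buffer.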
  @Override
  public void spillRecord(KeyType key, ValueType value) throws IOException {

    // mark buffer position ...
    int startPosition = -1;
    // LOG.info("Buffer start position:" + startPositon);

    boolean done = false;

    while (!done) {

      if (_activeBuffer == null) {
        allocateActiveBuffer();
      }

      startPosition = _activeBuffer.position();

      DataOutputStream outputStream = new DataOutputStream(
          newOutputStream(_activeBuffer));
      int keySizePos = 0;
      int valueSizePos = 0;
      int keySize = 0;
      int endPosition = 0;
      int valueSize = 0;
      boolean overflow = false;
      try {
        // save key size position
        keySizePos = _activeBuffer.position();
        // LOG.info("keySizePos:" + keySizePos);
        // skip past key length
        _activeBuffer.position(keySizePos + 4);
        // next write key and value
        key.write(outputStream);
        // now save value size position
        valueSizePos = _activeBuffer.position();
        // and calculate key size
        keySize = valueSizePos - keySizePos - 4;
        // reseek back
        _activeBuffer.position(keySizePos);
        // write out real key size
        _activeBuffer.putInt(keySize);
        // skip to value size position + 4
        _activeBuffer.position(valueSizePos + 4);
        // write out actual value
        value.write(outputStream);
        // save end position
        endPosition = _activeBuffer.position();
        // calculate value size
        valueSize = endPosition - valueSizePos - 4;
        // reseek back to value size pos
        _activeBuffer.position(valueSizePos);
        // write value size
        _activeBuffer.putInt(valueSize);
        // seek forward to end position
        _activeBuffer.position(endPosition);

        // LOG.info("keySizePos:" + keySizePos + " keySize:" + keySize +
        // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

        done = true;
      }
      // trap for buffer overflow specifically
      catch (IllegalArgumentException e) {
        // LOG.error("IllegalArgumentException detected while writing to Spill Data Buffer. Flushing");
        overflow = true;
      }
      // trap for buffer overflow specifically
      catch (BufferOverflowException e) {
        // LOG.error("Buffer Overflow detected while writing to Spill Data Buffer. Flushing");
        overflow = true;
      }

      if (overflow) {
        // LOG.error("Overflow Condition StartPosition:" + startPosition +
        // " KeySizePos:" + keySizePos + " keySize:" + keySize +
        // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

        if (startPosition == 0) {
          throw new IOException("Key + Value Size too Big for SpillBuffer !");
        }
        // reset to start position
        _activeBuffer.position(startPosition);
        // sort and spill
        flushActiveBuffer();
      }

    }

  }

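  // same framing as spillRecord(), but copies already-serialized key and value
  // bytes directly into the active spill buffer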
  @Override
  public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength,
      final byte[] valueData, final int valueOffset, final int valueLength)
  throws IOException {

    // mark buffer position ...
    int startPosition = -1;
    // LOG.info("Buffer start position:" + startPositon);

    boolean done = false;

    while (!done) {

      if (_activeBuffer == null) {
        allocateActiveBuffer();
      }

      startPosition = _activeBuffer.position();
      int keySizePos = 0;
      int valueSizePos = 0;
      int keySize = 0;
      int endPosition = 0;
      int valueSize = 0;
      boolean overflow = false;

      try {
        // save key size position
        keySizePos = _activeBuffer.position();
        // LOG.info("keySizePos:" + keySizePos);
        // skip past key length
        _activeBuffer.position(keySizePos + 4);
        // next write key and value
        _activeBuffer.put(keyData, keyOffset, keyLength);
        // now save value size position
        valueSizePos = _activeBuffer.position();
        // and calculate key size
        keySize = valueSizePos - keySizePos - 4;
        // reseek back
        _activeBuffer.position(keySizePos);
        // write out real key size
        _activeBuffer.putInt(keySize);
        // skip to value size position + 4
        _activeBuffer.position(valueSizePos + 4);
        // write out actual value
        _activeBuffer.put(valueData, valueOffset, valueLength);
        // save end position
        endPosition = _activeBuffer.position();
        // calculate value size
        valueSize = endPosition - valueSizePos - 4;
        // reseek back to value size pos
        _activeBuffer.position(valueSizePos);
        // write value size
        _activeBuffer.putInt(valueSize);
        // seek forward to end position
        _activeBuffer.position(endPosition);

        // LOG.info("keySizePos:" + keySizePos + " keySize:" + keySize +
        // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

        done = true;
      }
      // trap for buffer overflow specifically
      catch (IllegalArgumentException e) {
        // LOG.error("IllegalArgumentException detected while writing to Spill Data Buffer. Flushing");
        overflow = true;
      }
      // trap for buffer overflow specifically
      catch (BufferOverflowException e) {
        // LOG.error("Buffer Overflow detected while writing to Spill Data Buffer. Flushing");
        overflow = true;
      }

      if (overflow) {
        // LOG.error("Overflow Condition StartPosition:" + startPosition +
        // " KeySizePos:" + keySizePos + " keySize:" + keySize +
        // " valueSizePos:" + valueSizePos + " valueSize:" + valueSize);

        if (startPosition == 0) {
          throw new IOException("Key + Value Size too Big for SpillBuffer !");
        }
        // reset to start position
        _activeBuffer.position(startPosition);
        // sort and spill
        flushActiveBuffer();
      }
    }
  }

  private void allocateActiveBuffer() throws IOException {
    // LOG.info("Allocating Buffer of Size:" + BUFFER_SIZE);
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
  }

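  // flips the active buffer for reading and hands it to the writer thread via
  // the blocking queue; a new buffer will be allocated on the next spill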
  private void flushActiveBuffer() throws IOException {
    try {
      if (_activeBuffer != null && _activeBuffer.position() != 0) {
        // flip the buffer ...
        _activeBuffer.flip();
        // LOG.info("Flipped Active Buffer.Limit:" + _activeBuffer.limit()+
        // ". Queueing Buffer");
        // and queue it up for a write ...
        _bufferQueue.put(new QueuedBufferItem(_activeBuffer));
      }
    } catch (InterruptedException e) {
      // interrupted while handing the buffer to the writer thread; restore the
      // interrupt flag
      Thread.currentThread().interrupt();
    }
    _activeBuffer = null;
  }

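  // invoked from the writer thread for each record parsed out of a spill buffer:
  // optionally indexes the record at the current sequence file offset, then
  // appends the raw key/value bytes to the underlying SequenceFile.Writer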
  public void spillRawRecord2(byte[] keyData, int keyOffset, int keyLength,
      final byte[] valueData, final int valueOffset, final int valueLength)
  throws IOException {

    if (_indexWriter != null)
      _indexWriter.indexItem(keyData, keyOffset, keyLength, valueData,
          valueOffset, valueLength, writer.getLength());

    _recordCount++;

    writer.appendRaw(keyData, keyOffset, keyLength, new ValueBytes() {

      @Override
      public int getSize() {
        return valueLength;
      }

      @Override
      public void writeCompressedBytes(DataOutputStream outStream)
      throws IllegalArgumentException, IOException {
        throw new IOException("UnSupported Method");
      }

      @Override
      public void writeUncompressedBytes(DataOutputStream outStream)
      throws IOException {
        outStream.write(valueData, valueOffset, valueLength);
      }

    });
  }
}
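
A minimal usage sketch (not part of the original source). It assumes a local
FileSystem, a /tmp output path, Text/IntWritable key and value types, no index
writer, no compression codec, and a replication factor of 1; the class name
SpillWriterExample is hypothetical.

package org.commoncrawl.hadoop.mergeutils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class SpillWriterExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path outputPath = new Path("/tmp/spill-example.seq");

    // no index writer, no compression codec, replication factor 1 (assumed values)
    SequenceFileSpillWriter<Text, IntWritable> spillWriter =
        new SequenceFileSpillWriter<Text, IntWritable>(fs, conf, outputPath,
            Text.class, IntWritable.class, null, null, (short) 1);
    try {
      for (int i = 0; i < 1000; i++) {
        spillWriter.spillRecord(new Text("key-" + i), new IntWritable(i));
      }
    } finally {
      // close() flushes the active buffer, signals the writer thread to exit,
      // and closes the SequenceFile.Writer and the underlying output stream
      spillWriter.close();
    }
  }
}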