// Source: org.apache.pig.piggybank.storage.IndexedStorage
// (includes IndexedStorageInputFormat$IndexedStorageRecordReader$IndexedStorageLineReader)

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.pig.piggybank.storage;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.IndexableLoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextOutputFormat;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.piggybank.storage.IndexedStorage.IndexedStorageInputFormat.IndexedStorageRecordReader;
import org.apache.pig.piggybank.storage.IndexedStorage.IndexedStorageInputFormat.IndexedStorageRecordReader.IndexedStorageRecordReaderComparator;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataReaderWriter;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;

/**
* <code>IndexedStorage</code> is a form of <code>PigStorage</code> that supports a
* per record seek.  <code>IndexedStorage</code> creates a separate (hidden) index file for
* every data file that is written.  The format of the index file is:
* <pre>
* | Header     |
* | Index Body |
* | Footer     |
* </pre>
* The Header contains the list of record indices (field numbers) that represent index keys.
* The Index Body contains a <code>Tuple</code> for each record in the data. 
* The fields of the <code>Tuple</code> are:
* <ul>
* <li> The index key(s) <code>Tuple</code> </li>
* <li> The number of records that share this index key. </li>
* <li> Offset into the data file to read the first matching record. </li>
* </ul>
* The Footer contains sequentially:
* <ul>
* <li> The smallest key(s) <code>Tuple</code> in the index. </li>
* <li> The largest key(s) <code>Tuple</code> in the index. </li>
* <li> The offset in bytes to the start of the footer </li>
* </ul>
*
* <code>IndexedStorage</code> implements <code>IndexableLoadFunc</code> and
* can be used as the 'right table' in a Pig 'merge' or 'merge-sparse' join.
*
* <code>IndexedStorage</code> does not require the data to be globally partitioned &amp; sorted
* by index keys.  Each partition (separate index) must be locally sorted.
*
* Also note that <code>IndexedStorage</code> is a loader intended to demonstrate the "merge-sparse" join.
*/
public class IndexedStorage extends PigStorage implements IndexableLoadFunc {

  /**
   * Constructs a Pig Storer that uses specified regex as a field delimiter.
   * @param delimiter - field delimiter to use
   * @param offsetsToIndexKeys - list of offset into Tuple for index keys (comma separated)
   */
  public IndexedStorage(String delimiter, String offsetsToIndexKeys) {
    super(delimiter);

    this.fieldDelimiter = StorageUtil.parseFieldDel(delimiter);

    String[] stroffsetsToIndexKeys = offsetsToIndexKeys.split(",");
    this.offsetsToIndexKeys = new int[stroffsetsToIndexKeys.length];
    for (int i = 0; i < stroffsetsToIndexKeys.length; ++i) {
      this.offsetsToIndexKeys[i] = Integer.parseInt(stroffsetsToIndexKeys[i]);
    }
  }

  @Override
  public OutputFormat getOutputFormat() {
    return new IndexedStorageOutputFormat(fieldDelimiter, offsetsToIndexKeys);
  }

  /**
   * Assumes this list of readers is already sorted except for the provided element.
         * This element is bubbled up the array to its appropriate sort location
         * (faster than doing a Utils sort).
   */
  private void sortReader(int startIndex) {
    int idx = startIndex;
    while (idx < this.readers.length - 1) {
      IndexedStorageRecordReader reader1 = this.readers[idx];
      IndexedStorageRecordReader reader2 = this.readers[idx+1];
      if (this.readerComparator.compare(reader1, reader2) <= 0) {
        return;
      }
      this.readers[idx] = reader2;
      this.readers[idx+1] = reader1;
      idx++;
    }
  }

  /**
   * Internal OutputFormat class
   */
  public static class IndexedStorageOutputFormat extends PigTextOutputFormat {

    public IndexedStorageOutputFormat(byte delimiter, int[] offsetsToIndexKeys) {
      /* Call the base class constructor */
      super(delimiter);

      this.fieldDelimiter = delimiter;
      this.offsetsToIndexKeys = offsetsToIndexKeys;
    }

    @Override
    public RecordWriter<WritableComparable, Tuple> getRecordWriter(
        TaskAttemptContext context) throws IOException,
        InterruptedException {

      Configuration conf = context.getConfiguration();

      FileSystem fs = FileSystem.get(conf);
      Path file = this.getDefaultWorkFile(context, "");   
      FSDataOutputStream fileOut = fs.create(file, false);

      IndexManager indexManager = new IndexManager(offsetsToIndexKeys);
      indexManager.createIndexFile(fs, file);
      return new IndexedStorageRecordWriter(fileOut, this.fieldDelimiter, indexManager);
    }

    /**
     * Internal class to do the actual record writing and index generation
     *
     */
    public static class IndexedStorageRecordWriter extends PigLineRecordWriter {

      public IndexedStorageRecordWriter(FSDataOutputStream fileOut, byte fieldDel, IndexManager indexManager) throws IOException {
        super(fileOut, fieldDel);

        this.fileOut = fileOut;
        this.indexManager = indexManager;

        /* Write the index header first */
        this.indexManager.WriteIndexHeader();
      }

      @Override
      public void write(WritableComparable key, Tuple value) throws IOException {
        /* Write the data */
        long offset = this.fileOut.getPos();
        super.write(key, value);

        /* Build index */
        this.indexManager.BuildIndex(value, offset)
      }

      @Override
      public void close(TaskAttemptContext context)
      throws IOException {
        this.indexManager.WriterIndexFooter();
        this.indexManager.Close();
        super.close(context);
      }

      /**
       * Output stream for data
       */
      private FSDataOutputStream fileOut;

      /**
       * Index builder
       */
      private IndexManager indexManager = null;
    }

    /**
     * Delimiter to use between fields
     */
    final private byte fieldDelimiter;

    /**
     * Offsets to index keys in given tuple
     */
    final protected int[] offsetsToIndexKeys;
  }


  @Override
  public InputFormat getInputFormat() {
    return new IndexedStorageInputFormat();
  }
 
  @Override
  public Tuple getNext() throws IOException {
    if (this.readers == null) {
      return super.getNext();
    }

    while (currentReaderIndexStart < this.readers.length) {
      IndexedStorageRecordReader r = this.readers[currentReaderIndexStart];

      this.prepareToRead(r, null);
      Tuple tuple = super.getNext();
      if (tuple == null) {
        currentReaderIndexStart++;
        r.close();
        continue; //next Reader
      }
   
      //if we haven't yet initialized the indexManager (by reading the first index key) 
      if (r.indexManager.lastIndexKeyTuple == null) {

        //initialize the indexManager
        if (r.indexManager.ReadIndex() == null) {
          //There should never be a case where there is a non-null record - but no corresponding index.
          throw new IOException("Missing Index for Tuple: " + tuple);
        }
      }

      r.indexManager.numberOfTuples--;

      if (r.indexManager.numberOfTuples == 0) {
        if (r.indexManager.ReadIndex() == null) {
          r.close();
          currentReaderIndexStart++;
        } else {
          //Since the index of the current reader was increased, we may need to push the
          //current reader back in the sorted list of readers.
          sortReader(currentReaderIndexStart);
        }
      }
      return tuple;
    }

    return null;
  }

  /**
   * IndexableLoadFunc interface implementation
   */
  @Override
  public void initialize(Configuration conf) throws IOException {
    try {
      InputFormat inputFormat = this.getInputFormat();
      TaskAttemptID id = TaskAttemptID.forName(conf.get("mapred.task.id"));
     
      if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
                    conf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
      }
      List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
      this.readers = new IndexedStorageRecordReader[fileSplits.size()];
     
      int idx = 0;
      Iterator<FileSplit> it = fileSplits.iterator();
      while (it.hasNext()) {
        FileSplit fileSplit = it.next();
        TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
        IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
        r.initialize(fileSplit, context);
        this.readers[idx] = r;
        idx++;
      }

      Arrays.sort(this.readers, this.readerComparator);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }   
  }

  @Override
  /* The list of readers is always sorted before and after this call. */
  public void seekNear(Tuple keys) throws IOException {

    /* Keeps track of the last (if any) reader where seekNear was called */
    int lastIndexModified = -1;

    int idx = currentReaderIndexStart;
    while (idx < this.readers.length) {
      IndexedStorageRecordReader r = this.readers[idx];

      /* The key falls within the range of the reader index */
      if (keys.compareTo(r.indexManager.maxIndexKeyTuple) <= 0 && keys.compareTo(r.indexManager.minIndexKeyTuple) >= 0) {
        r.seekNear(keys);
        lastIndexModified = idx;

      /* The key is greater than the current range of the reader index */
      } else if (keys.compareTo(r.indexManager.maxIndexKeyTuple) > 0) {
        currentReaderIndexStart++;
      /* DO NOTHING - The key is less than the current range of the reader index */
      } else {
        break;
      }
      idx++;
    }

    /*
     * There is something to sort. 
     * We can rely on the following invariants that make the following check accurate:
      *  - currentReaderIndexStart is always >= 0.
     *  - lastIndexModified is only positive if seekNear was called.
     *  - lastIndexModified >= currentReaderIndexStart if lastIndexModifed >= 0.  This is true because the list
      * is already sorted.
     */
    if (lastIndexModified - currentReaderIndexStart >= 0) {

      /*
       * The following logic is optimized for the (common) case where there are a tiny number of readers that
             * need to be repositioned relative to the other readers in the much larger sorted list.
       */

      /* First, just sort the readers that were updated relative to one another. */
      Arrays.sort(this.readers, currentReaderIndexStart, lastIndexModified+1, this.readerComparator);

      /* In descending order, push the updated readers back in the the sorted list. */
      for (idx = lastIndexModified; idx >= currentReaderIndexStart; idx--) {
        sortReader(idx);
      }
    }
  }


  @Override
  public void close() throws IOException {
    for (IndexedStorageRecordReader reader : this.readers) {
      reader.close();
    }
  }

  /**
   * <code>IndexManager</code> manages the index file (both writing and reading)
   * It keeps track of the last index read during reading.
   */
  public static class IndexManager {

    /**
     * Constructor (called during reading)
     * @param ifile index file to read
     */
    public IndexManager(FileStatus ifile) {
      this.indexFile = ifile;
      this.offsetToFooter = -1;
    }

    /**
     * Constructor (called during writing)
     * @param offsetsToIndexKeys
     */
    public IndexManager(int[] offsetsToIndexKeys) {
      this.offsetsToIndexKeys = offsetsToIndexKeys;
      this.offsetToFooter = -1;
    }

    /**
     * Construct index file path for a given a data file
     * @param file - Data file
     * @return - Index file path for given data file
     */
    private static Path getIndexFileName(Path file) {
      return new Path(file.getParent(), "." + file.getName() + ".index");
    }

    /**
     * Open the index file for writing for given data file
     * @param fs
     * @param file
     * @throws IOException
     */
    public void createIndexFile(FileSystem fs, Path file) throws IOException {
      this.indexOut = fs.create(IndexManager.getIndexFileName(file), false)
    }

    /**
      * Opens the index file.
     */
    public void openIndexFile(FileSystem fs) throws IOException {
      this.indexIn = fs.open(this.indexFile.getPath())
    }

    /**
     * Close the index file
     * @throws IOException
     */
    public void Close() throws IOException {
      this.indexOut.close();
    }

    /**
     * Build index tuple
     *
     * @throws IOException
     */
    private void BuildIndex(Tuple t, long offset) throws IOException {
      /* Build index key tuple */
      Tuple indexKeyTuple = tupleFactory.newTuple(this.offsetsToIndexKeys.length);
      for (int i = 0; i < this.offsetsToIndexKeys.length; ++i) {
        indexKeyTuple.set(i, t.get(this.offsetsToIndexKeys[i]));
      }

      /* Check if we have already seen Tuple(s) with same index keys */
      if (indexKeyTuple.compareTo(this.lastIndexKeyTuple) == 0) {
        /* We have seen Tuple(s) with given index keys, update the tuple count */
        this.numberOfTuples += 1;
      }
      else {
        if (this.lastIndexKeyTuple != null)
          this.WriteIndex();

        this.lastIndexKeyTuple = indexKeyTuple;
        this.minIndexKeyTuple = ((this.minIndexKeyTuple == null) || (indexKeyTuple.compareTo(this.minIndexKeyTuple) < 0)) ? indexKeyTuple : this.minIndexKeyTuple;
        this.maxIndexKeyTuple = ((this.maxIndexKeyTuple == null) || (indexKeyTuple.compareTo(this.maxIndexKeyTuple) > 0)) ? indexKeyTuple : this.maxIndexKeyTuple;

        /* New index tuple for newly seen index key */
        this.indexTuple = tupleFactory.newTuple(3);

        /* Add index keys to index Tuple */
        this.indexTuple.set(0, indexKeyTuple);

        /* Reset Tuple count for index key */
        this.numberOfTuples = 1;

        /* Remember offset to Tuple with new index keys */
        this.indexTuple.set(2, offset);
      }
    }


    /**
     * Write index header
     * @param indexOut - Stream to write to
     * @param ih - Index header to write
     * @throws IOException
     */
    public void WriteIndexHeader() throws IOException {
      /* Number of index keys */
      indexOut.writeInt(this.offsetsToIndexKeys.length);

      /* Offset to index keys */
      for (int i = 0; i < this.offsetsToIndexKeys.length; ++i) {
        indexOut.writeInt(this.offsetsToIndexKeys[i]);
      }
    }

    /**
     * Read index header
     * @param indexIn - Stream to read from
     * @return Index header
     * @throws IOException
     */
    public void ReadIndexHeader() throws IOException {
      /* Number of index keys */
      int nkeys = this.indexIn.readInt();

      /* Offset to index keys */
      this.offsetsToIndexKeys = new int[nkeys];
      for (int i = 0; i < nkeys; ++i) {
        offsetsToIndexKeys[i] = this.indexIn.readInt();
      }
    }

    /**
     * Writes the index footer
     */
    public void WriterIndexFooter() throws IOException {
      /* Flush indexes for remaining records */
      this.WriteIndex();

      /* record the offset to footer */
      this.offsetToFooter = this.indexOut.getPos();

      /* Write index footer */
      DataReaderWriter.writeDatum(indexOut, this.minIndexKeyTuple);
      DataReaderWriter.writeDatum(indexOut, this.maxIndexKeyTuple);

      /* Offset to footer */
      indexOut.writeLong(this.offsetToFooter);
    }

    /**
     * Reads the index footer
     */
    public void ReadIndexFooter() throws IOException {
      long currentOffset = this.indexIn.getPos();

      this.SeekToIndexFooter();
      this.minIndexKeyTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn);
      this.maxIndexKeyTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn);

      this.indexIn.seek(currentOffset);
    }

    /**
     * Seeks to the index footer
     */
    public void SeekToIndexFooter() throws IOException {
      if (this.offsetToFooter < 0) {
        /* offset to footer is at last long (8 bytes) in the file */
        this.indexIn.seek(this.indexFile.getLen()-8);
        this.offsetToFooter = this.indexIn.readLong();
      }
      this.indexIn.seek(this.offsetToFooter);
    }

    /**
     * Writes the current index.
     */
    public void WriteIndex() throws IOException {
      this.indexTuple.set(1, this.numberOfTuples);
      DataReaderWriter.writeDatum(this.indexOut, this.indexTuple);
    }

    /**
     * Extracts the index key from the index tuple
     */
    public Tuple getIndexKeyTuple(Tuple indexTuple) throws IOException {
      if (indexTuple.size() == 3)
        return (Tuple)indexTuple.get(0);
      else
        throw new IOException("Invalid index record with size " + indexTuple.size());
    }

    /**
     * Extracts the number of records that share the current key from the index tuple.
     */
    public long getIndexKeyTupleCount(Tuple indexTuple) throws IOException {
      if (indexTuple.size() == 3)
        return (Long)indexTuple.get(1);
      else
        throw new IOException("Invalid index record with size " + indexTuple.size());
    }

    /**
     * Extracts the offset into the data file from the index tuple.
     */
    public long getOffset(Tuple indexTuple) throws IOException {
      if (indexTuple.size() == 3)
        return (Long)indexTuple.get(2);
      else
        throw new IOException("Invalid index record with size " + indexTuple.size());
    }
 
    /**
      * Reads the next index from the index file (or null if EOF) and extracts
     * the index fields.
     */ 
    public Tuple ReadIndex() throws IOException {
      if (this.indexIn.getPos() < this.offsetToFooter) {
        indexTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn);
        if (indexTuple != null) {
          this.lastIndexKeyTuple = this.getIndexKeyTuple(indexTuple);
          this.numberOfTuples = this.getIndexKeyTupleCount(indexTuple);
        }
        return indexTuple;
      }
      return null;
    }

    /**
     * Scans the index looking for a given key.
     * @return the matching index tuple OR the last index tuple
     * greater than the requested key if no match is found.
     */
    public Tuple ScanIndex(Tuple keys) throws IOException {
      if (lastIndexKeyTuple != null && keys.compareTo(this.lastIndexKeyTuple) <= 0) {
        return indexTuple;
      }

      /* Scan the index looking for given key */
      while ((indexTuple = this.ReadIndex()) != null) {
        if (keys.compareTo(this.lastIndexKeyTuple) > 0)
          continue;
        else
          break;
      }
     
      return indexTuple;
    }

    /**
     * stores the list of record indices that identify keys.
     */
    private int[] offsetsToIndexKeys = null;

    /**
     * offset in bytes to the start of the footer of the index.
     */
    private long offsetToFooter = -1;

    /**
     * output stream when writing the index.
     */
    FSDataOutputStream indexOut;

    /**
     * input stream when reading the index.
     */
    FSDataInputStream indexIn;

    /**
     * Tuple factory to create index tuples
     */
    private TupleFactory tupleFactory = TupleFactory.getInstance();

    /**
     * Index key tuple of the form
     * ((Tuple of index keys), count of tuples with index keys, offset to first tuple with index keys)
     */
    private Tuple indexTuple = tupleFactory.newTuple(3);

    /**
     * "Smallest" index key tuple seen
     */
    private Tuple minIndexKeyTuple = null;

    /**
     * "Biggest" index key tuple seen
     */
    private Tuple maxIndexKeyTuple = null;

    /**
     * Last seen index key tuple
     */
    private Tuple lastIndexKeyTuple = null;

    /**
     * Number of tuples seen for a index key
     */
    private long numberOfTuples = 0;

    /**
      * The index file.
     */
    private FileStatus indexFile;
  }

  /**
   * Internal InputFormat class
   */
  public static class IndexedStorageInputFormat extends PigTextInputFormat {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
      IndexManager im = null;
      try {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path indexFile = IndexManager.getIndexFileName(((FileSplit)split).getPath());
        im = new IndexManager(fs.getFileStatus(indexFile));
        im.openIndexFile(fs);
        im.ReadIndexHeader();
        im.ReadIndexFooter();
      } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
      }

      return new IndexedStorageRecordReader(im);
    }
   
    @Override
    public boolean isSplitable(JobContext context, Path filename) {
      return false;
    }

    /**
     * Internal RecordReader class
      */
    public static class IndexedStorageRecordReader extends RecordReader<LongWritable, Text> {
      private long start;
      private long pos;
      private long end;
      private IndexedStorageLineReader in;
      private int maxLineLength;
      private LongWritable key = null;
      private Text value = null;
      private IndexManager indexManager = null;

      @Override
      public String toString() {
        return indexManager.minIndexKeyTuple + "|" + indexManager.lastIndexKeyTuple + "|" + indexManager.maxIndexKeyTuple;
      }

      public IndexedStorageRecordReader(IndexManager im) {
        this.indexManager = im;
      }

      /**
       * Class to compare record readers using underlying indexes
       *
       */
      public static class IndexedStorageRecordReaderComparator implements Comparator<IndexedStorageRecordReader> {
        @Override
        public int compare(IndexedStorageRecordReader o1, IndexedStorageRecordReader o2) {
          Tuple t1 = (o1.indexManager.lastIndexKeyTuple == null) ?  o1.indexManager.minIndexKeyTuple : o1.indexManager.lastIndexKeyTuple;
          Tuple t2 = (o2.indexManager.lastIndexKeyTuple == null) ?  o2.indexManager.minIndexKeyTuple : o2.indexManager.lastIndexKeyTuple;
          return t1.compareTo(t2);
        }
      }

      public static class IndexedStorageLineReader {
        private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
        private int bufferSize = DEFAULT_BUFFER_SIZE;
        private InputStream in;
        private byte[] buffer;
        // the number of bytes of real data in the buffer
        private int bufferLength = 0;
        // the current position in the buffer
        private int bufferPosn = 0;
        private long bufferOffset = 0;

        private static final byte CR = '\r';
        private static final byte LF = '\n';

        /**
         * Create a line reader that reads from the given stream using the
         * default buffer-size (64k).
         * @param in The input stream
         * @throws IOException
         */
        public IndexedStorageLineReader(InputStream in) {
          this(in, DEFAULT_BUFFER_SIZE);
        }

        /**
         * Create a line reader that reads from the given stream using the
         * given buffer-size.
         * @param in The input stream
         * @param bufferSize Size of the read buffer
         * @throws IOException
         */
        public IndexedStorageLineReader(InputStream in, int bufferSize) {
          if( !(in instanceof Seekable) || !(in instanceof PositionedReadable) ) {
                throw new IllegalArgumentException(
                    "In is not an instance of Seekable or PositionedReadable");
          }
         
          this.in = in;
          this.bufferSize = bufferSize;
          this.buffer = new byte[this.bufferSize];
        }

        /**
         * Create a line reader that reads from the given stream using the
         * <code>io.file.buffer.size</code> specified in the given
         * <code>Configuration</code>.
         * @param in input stream
         * @param conf configuration
         * @throws IOException
         */
        public IndexedStorageLineReader(InputStream in, Configuration conf) throws IOException {
          this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
        }

        /**
         * Close the underlying stream.
         * @throws IOException
         */
        public void close() throws IOException {
          in.close();
        }

        /**
         * Read one line from the InputStream into the given Text.  A line
         * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
         * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
         * line.
         *
         * @param str the object to store the given line (without newline)
         * @param maxLineLength the maximum number of bytes to store into str;
         *  the rest of the line is silently discarded.
         * @param maxBytesToConsume the maximum number of bytes to consume
         *  in this call.  This is only a hint, because if the line cross
         *  this threshold, we allow it to happen.  It can overshoot
         *  potentially by as much as one buffer length.
         *
         * @return the number of bytes read including the (longest) newline
         * found.
         *
         * @throws IOException if the underlying stream throws
         */
        public int readLine(Text str, int maxLineLength,
            int maxBytesToConsume) throws IOException {
          /* We're reading data from in, but the head of the stream may be
           * already buffered in buffer, so we have several cases:
           * 1. No newline characters are in the buffer, so we need to copy
           *    everything and read another buffer from the stream.
           * 2. An unambiguously terminated line is in buffer, so we just
           *    copy to str.
           * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
           *    in CR.  In this case we copy everything up to CR to str, but
           *    we also need to see what follows CR: if it's LF, then we
           *    need consume LF as well, so next call to readLine will read
           *    from after that.
           * We use a flag prevCharCR to signal if previous character was CR
           * and, if it happens to be at the end of the buffer, delay
           * consuming it until we have a chance to look at the char that
           * follows.
           */
          str.clear();
          int txtLength = 0; //tracks str.getLength(), as an optimization
          int newlineLength = 0; //length of terminating newline
          boolean prevCharCR = false; //true of prev char was CR
          long bytesConsumed = 0;
          do {
            int startPosn = bufferPosn; //starting from where we left off the last time
            if (bufferPosn >= bufferLength) {
              startPosn = bufferPosn = 0;
              if (prevCharCR)
                ++bytesConsumed; //account for CR from previous read
             
              bufferOffset = ((Seekable)in).getPos();
              bufferLength = in.read(buffer);
             
              if (bufferLength <= 0)
                break; // EOF
            }
            for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
              if (buffer[bufferPosn] == LF) {
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
              }
              if (prevCharCR) { //CR + notLF, we are at notLF
                newlineLength = 1;
                break;
              }
              prevCharCR = (buffer[bufferPosn] == CR);
            }
            int readLength = bufferPosn - startPosn;
            if (prevCharCR && newlineLength == 0)
              --readLength; //CR at the end of the buffer
            bytesConsumed += readLength;
            int appendLength = readLength - newlineLength;
            if (appendLength > maxLineLength - txtLength) {
              appendLength = maxLineLength - txtLength;
            }
            if (appendLength > 0) {
              str.append(buffer, startPosn, appendLength);
              txtLength += appendLength;
            }
          } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

          if (bytesConsumed > (long)Integer.MAX_VALUE)
            throw new IOException("Too many bytes before newline: " + bytesConsumed);   
          return (int)bytesConsumed;
        }

        /**
         * Read from the InputStream into the given Text.
         * @param str the object to store the given line
         * @param maxLineLength the maximum number of bytes to store into str.
         * @return the number of bytes read including the newline
         * @throws IOException if the underlying stream throws
         */
        public int readLine(Text str, int maxLineLength) throws IOException {
          return readLine(str, maxLineLength, Integer.MAX_VALUE);
        }

        /**
         * Read from the InputStream into the given Text.
         * @param str the object to store the given line
         * @return the number of bytes read including the newline
         * @throws IOException if the underlying stream throws
         */
        public int readLine(Text str) throws IOException {
          return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
        }
       
        /**
         * If given offset is within the buffer, adjust the buffer position to read from
         * otherwise seek to the given offset from start of the file.
         * @param offset
         * @throws IOException
         */
        public void seek(long offset) throws IOException {
          if ((offset >= bufferOffset) && (offset < (bufferOffset + bufferLength)))
            bufferPosn = (int) (offset - bufferOffset);
          else {
            bufferPosn = bufferLength;
            ((Seekable)in).seek(offset);
          }
        }
      }

      @Override
      public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {

        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();

        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        boolean skipFirstLine = false;
        if (start != 0) {
          skipFirstLine = true;
          --start;
          fileIn.seek(start);
        }
        in = new IndexedStorageLineReader(fileIn, job);
        if (skipFirstLine) {
          start += in.readLine(new Text(), 0, (int)Math.min((long)Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
      }

      public void seek(long offset) throws IOException {
        in.seek(offset);
        pos = offset;
      }

      /**
       * Scan the index for given key and seek to appropriate offset in the data
       * @param keys to look for
       * @return true if the given key was found, false otherwise
       * @throws IOException
       */
      public boolean seekNear(Tuple keys) throws IOException {
        boolean ret = false;
        Tuple indexTuple = this.indexManager.ScanIndex(keys);
        if (indexTuple != null) {
          long offset = this.indexManager.getOffset(indexTuple) ;
          in.seek(offset);
       
          if (keys.compareTo(this.indexManager.getIndexKeyTuple(indexTuple)) == 0) {
            ret = true;
          }
        }
       
        return ret;
      }

      @Override
      public boolean nextKeyValue() throws IOException,
      InterruptedException {
        if (key == null) {
          key = new LongWritable();
        }
        key.set(pos);
        if (value == null) {
          value = new Text();
        }
        int newSize = 0;
        while (pos < end) {
          newSize = in.readLine(value, maxLineLength,
              Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
                  maxLineLength));
          if (newSize == 0) {
            break;
          }
          pos += newSize;
          if (newSize < maxLineLength) {
            break;
          }
        }
        if (newSize == 0) {
          key = null;
          value = null;
          return false;
        } else {
          return true;
        }
      }

      @Override
      public LongWritable getCurrentKey() throws IOException,
      InterruptedException {
        return key;
      }

      @Override
      public Text getCurrentValue() throws IOException,
      InterruptedException {
        return value;
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
          return 0.0f;
        } else {
          return Math.min(1.0f, (pos - start) / (float)(end - start));
        }
      }

      @Override
      public void close() throws IOException {
        if (in != null) {
          in.close();
        }
      }
    }
  }


  /**
   * List of record readers.
   */
  protected IndexedStorageRecordReader[] readers = null

  /**
   * Index into the the list of readers to the current reader. 
    * Readers before this index have been fully scanned for keys.
   */
  protected int currentReaderIndexStart = 0;

  /**
   * Delimiter to use between fields
   */
  protected byte fieldDelimiter = '\t';

  /**
   * Offsets to index keys in tuple
   */
  final protected int[] offsetsToIndexKeys;

  /**
   * Comparator used to compare key tuples.
   */
  protected Comparator<IndexedStorageRecordReader> readerComparator = new IndexedStorageInputFormat.IndexedStorageRecordReader.IndexedStorageRecordReaderComparator();
}
TOP

Related Classes of org.apache.pig.piggybank.storage.IndexedStorageInputFormat$IndexedStorageRecordReader$IndexedStorageLineReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.