Source Code of org.apache.hadoop.mapred.lib.FixedLengthRecordReader

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.hadoop.mapred.lib;


import java.io.IOException;
import java.io.InputStream;


import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;


import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;


import org.apache.hadoop.conf.Configuration;


import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.RecordReader;


import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;


import org.apache.hadoop.io.BytesWritable;


/**
 * 
 * FixedLengthRecordReader is returned by FixedLengthInputFormat. This reader
 * uses the record length property set within the FixedLengthInputFormat to 
 * read one record at a time from the given InputSplit. This record reader
 * does not support compressed files.<BR><BR>
 * 
 * Each call to nextKeyValue() updates the LongWritable KEY and BytesWritable 
 * VALUE.<BR><BR>
 * 
 * KEY = (BytesWritable) The KEY is the either the record position (Long as a byte array)
 * within the  InputSplit OR the bytes located between the FixedLengthInputFormat.FIXED_RECORD_KEY_START_AT
 * and FixedLengthInputFormat.FIXED_RECORD_KEY_END_AT property values, if set
 * by the caller when the job was configured.<BR><BR>
 * 
 * VALUE = the record itself (BytesWritable)
 * 
 * @see FixedLengthInputFormat
 *
 */
public class FixedLengthRecordReader 
  implements RecordReader<BytesWritable, BytesWritable> {        


  // reference to the logger
  private static final Log LOG = 
    LogFactory.getLog(FixedLengthRecordReader.class);


  private CompressionCodecFactory compressionCodecs = null;
  private long start;
  private long pos;
  private long end;
  private InputStream in;
  private FSDataInputStream fileIn;
  private final Seekable filePosition;
  private int recordLength;
  private int recordKeyStartAt;
  private int recordKeyEndAt;
  private int recordKeyLength;
  private CompressionCodec codec;
  private Decompressor decompressor;
  
  private byte[] recordBytes;


  // our record key 
  private BytesWritable recordKey = null;


  // the record value
  private BytesWritable recordValue = null; 
  
  public FixedLengthRecordReader(FileSplit split, 
                                 Configuration conf) throws IOException {
    // the size of each fixed length record
    this.recordLength = FixedLengthInputFormat.getRecordLength(conf);
    
    // the start position for each key
    this.recordKeyStartAt = FixedLengthInputFormat.getRecordKeyStartAt(conf);
    
    // the end position for each key
    this.recordKeyEndAt = FixedLengthInputFormat.getRecordKeyEndAt(conf);
    
    // record key length (add 1 because the start/end points are INCLUSIVE)
    this.recordKeyLength = recordKeyEndAt - recordKeyStartAt + 1;
    
    // log some debug info
    LOG.info("FixedLengthRecordReader: recordLength="+this.recordLength);
    LOG.info("FixedLengthRecordReader: " + 
        (this.recordKeyStartAt != -1 ? 
            (" KEY-START-AT=" + this.recordKeyStartAt + 
             " KEY-END-AT=" + this.recordKeyEndAt) : 
             " NO-CUSTOM-KEY-START/END SPECIFIED, KEY will be record " +
             "position in InputSplit"));
                 
    start = split.getStart();
    end = split.getLength() + start;
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    codec = compressionCodecs.getCodec(file);
    
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(conf);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec)codec).createInputStream(
            fileIn, decompressor, start, end,
            SplittableCompressionCodec.READ_MODE.BYBLOCK); 
        in = (InputStream)cIn;
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        in = (InputStream)codec.createInputStream(fileIn, decompressor);
        filePosition = fileIn;
      }
    } else {
      fileIn.seek(start);
      in = (InputStream)fileIn;
      filePosition = fileIn;
    }
    
    if (start != 0) {
      // seek to the start of the next record.
      long mod = start % this.recordLength;
      long dist = mod != 0 ? 
        (this.recordLength - (start % this.recordLength))
        : 0;
      LOG.info("FixedLengthRecordReader: " + 
        "Starting at offset " + start + " seeking " + dist + 
        " bytes to next record.");
      filePosition.seek(start+dist);
      start += dist;
    }
    this.pos = start;
  }
  
  public BytesWritable createKey() {
      return new BytesWritable(new byte[this.recordKeyLength]);
  }
  
  public BytesWritable createValue() {
      return new BytesWritable(new byte[this.recordLength]);
  }
  
  private boolean isCompressedInput() {
    return (codec != null);
  }


  @Override
  public synchronized void close() throws IOException {
    try {
      if (in != null) {
        in.close();
      }
    } finally {
      if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
      }
    }
  }
  
  public synchronized long getPos() throws IOException {
    return pos;
  }
  
  private long getFilePosition() throws IOException {
    long retVal;
    if (isCompressedInput() && null != filePosition) {
      retVal = filePosition.getPos();
    } else {
      retVal = pos;
    }
    return retVal;
  }


  @Override
  public synchronized float getProgress() throws IOException {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (getFilePosition() - start) / 
          (float)(end-start));
    } 
  }
  
 
  /** Read the next record in the input stream. 
   * @param record an array of recordLength bytes.
   * */
  private void readRecord(byte[] record) throws IOException {
    int totalRead = 0; // total bytes read
    int totalToRead = recordLength; // total bytes we need to read


    // while we still have record bytes to read
    while(totalRead != recordLength) {
      // read in what we need
      int read = this.in.read(record, totalRead, totalToRead);


      /* EOF? this is an error because each 
       * split calculated by FixedLengthInputFormat
       * contains complete records, if we receive 
       * an EOF within this loop, then we have
       * only read a partial record as totalRead != recordLength
       */
      if (read == -1) {
          throw new IOException("FixedLengthRecordReader, " +
                  " unexpectedly encountered an EOF when attempting" +
                  " to read in an entire record from the current split");
      }
    
      // read will never be zero, because read is only
      // zero if you pass in zero to the read() call above


      // update our markers
      totalRead += read;
      totalToRead -= read;
    }
  }
  
  @Override
  public synchronized boolean next(BytesWritable key, BytesWritable value)
    throws IOException {
    
    while (getFilePosition() < end) {
      this.readRecord(value.getBytes());
      
      if (recordKeyStartAt != -1 && recordKeyEndAt != -1) {
        key.set(recordValue.getBytes(), this.recordKeyStartAt, this.recordKeyLength);
        
      // otherwise do the default action, (key is record position in the split)
      } else {
        // default is that the the Key is the position the record started at
        byte[] posKey = toBytes(pos);
        key.set(posKey,0,posKey.length);
      }
      
      pos += recordLength;
      
      return true;
    }
    
    return false;
  }
  
  public static byte[] toBytes(long val) {
    byte [] b = new byte[8];
    for(int i=7;i>0;i--) {
      b[i] = (byte)(val);
      val >>>= 8;
    }
    b[0] = (byte)(val);
    return b;
  }
}
Source Code of org.apache.hadoop.mapred.lib.FixedLengthRecordReader

Related Classes of org.apache.hadoop.mapred.lib.FixedLengthRecordReader