package com.linkedin.pig;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.trevni.ColumnFileMetaData;
import org.apache.trevni.avro.AvroColumnReader;
import org.apache.trevni.avro.AvroColumnWriter;
import org.apache.trevni.avro.AvroTrevniOutputFormat;
import org.apache.trevni.avro.HadoopInput;
import com.google.common.collect.Lists;
/**
* Pig Store/Load Function for Trevni.
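*
* A minimal Pig Latin usage sketch (the paths here are illustrative, not
* taken from this codebase):
* <pre>
*   records = LOAD 'input_dir' USING com.linkedin.pig.TrevniStorage();
*   STORE records INTO 'output_dir' USING com.linkedin.pig.TrevniStorage();
* </pre>
*
* @author jadler@linkedin.com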
*/
public class TrevniStorage extends FastAvroStorage implements LoadPushDown {
public TrevniStorage() {
}
public TrevniStorage(String[] vars) {
super(vars);
}
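/** Filter accepting visible (non-hidden) files with the Trevni extension. */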
public static final PathFilter visibleTrevniFiles = new PathFilter() {
@Override
public boolean accept(Path p) {
return (!p.getName().startsWith("_") &&
p.getName().endsWith(AvroTrevniOutputFormat.EXT));
}
};
/*
* (non-Javadoc)
*
* @see org.apache.pig.LoadPushDown#getFeatures()
*/
@Override
public List<OperatorSet> getFeatures() {
return Lists.newArrayList(LoadPushDown.OperatorSet.PROJECTION);
}
/** The most recently pushed-down projection, if any. */
RequiredFieldList requiredFieldList;
/** Prints a required-field list to standard error, indenting sub-fields by nesting level. */
private void requiredFieldListPrinter(List<RequiredField> rfl, int level) {
for (RequiredField rf : rfl) {
for (int i = 0; i <= level; i++)
System.err.print("\t");
System.err.printf("%s\n", rf.getAlias());
if (rf.getSubFields() != null)
requiredFieldListPrinter(rf.getSubFields(), level + 1);
}
}
/*
* (non-Javadoc)
*
* @see
* org.apache.pig.LoadPushDown#pushProjection(org.apache.pig.LoadPushDown.
* RequiredFieldList)
*/
@Override
public RequiredFieldResponse pushProjection(RequiredFieldList rfl) throws FrontendException {
requiredFieldList = rfl;
System.err.printf("Requested fields:\n");
requiredFieldListPrinter(rfl.getFields(), 0);
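// Build a narrowed Avro schema that contains only the required fields.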
Schema newSchema = FastAvroStorageCommon.newSchemaFromRequiredFieldList(schema, rfl);
if (newSchema != null) {
schema = newSchema;
setInputAvroSchema(schema);
System.err.printf("Only grabbing subset %s\n", getInputAvroSchema());
return new RequiredFieldResponse(true);
} else {
System.err.printf("Could not select subset of fields %s\n", rfl);
return new RequiredFieldResponse(false);
}
}
/*
* (non-Javadoc)
*
* @see org.apache.pig.LoadFunc#getInputFormat()
*/
@Override
public InputFormat<NullWritable, GenericData.Record> getInputFormat() throws IOException {
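// Local input format that treats each Trevni file as a single unsplittable
// input and presents its rows as Avro GenericData.Record values.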
class TrevniStorageInputFormat extends PigFileInputFormat<NullWritable,GenericData.Record> {
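// Each Trevni file is read in its entirety by a single reader, so never split it.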
@Override
protected boolean isSplitable(JobContext jc, Path p) {
return false;
}
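// Recursively list the input directories, keeping only visible Trevni data files.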
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
List<FileStatus> results = Lists.newArrayList();
job.getConfiguration().setBoolean("mapred.input.dir.recursive", true);
for (FileStatus file : super.listStatus(job))
if (visibleTrevniFiles.accept(file.getPath()))
results.add(file);
return results;
}
@Override
public RecordReader<NullWritable, GenericData.Record> createRecordReader(InputSplit is, TaskAttemptContext tc)
throws IOException, InterruptedException {
RecordReader<NullWritable, GenericData.Record> rr =
new RecordReader<NullWritable, GenericData.Record>() {
private FileSplit fsplit;
private AvroColumnReader.Params params;
private AvroColumnReader<GenericData.Record> reader;
private float rows; // total row count, stored as float for progress arithmetic
private long row = 0; // number of rows read so far
private GenericData.Record currentRecord = null;
@Override
public void close() throws IOException {
reader.close();
}
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public Record getCurrentValue() throws IOException, InterruptedException {
return currentRecord;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// Guard against empty files, where a zero row count would yield NaN.
return (rows > 0) ? row / rows : 1.0f;
}
@Override
public void initialize(InputSplit isplit, TaskAttemptContext tac) throws IOException, InterruptedException {
fsplit = (FileSplit) isplit;
params = new AvroColumnReader.Params(
new HadoopInput(fsplit.getPath(), tac.getConfiguration()));
Schema inputSchema = getInputAvroSchema();
System.err.printf("initializing RecordReader with schema %s\n", inputSchema);
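// Restrict the reader to the (possibly projected) input schema so that
// only the required columns are read from disk.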
params.setSchema(inputSchema);
reader = new AvroColumnReader<GenericData.Record>(params);
rows = reader.getRowCount();
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (reader.hasNext()) {
currentRecord = reader.next();
row++;
return true;
} else {
return false;
}
}
};
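// Initialize the reader before returning it and record the split in the task status.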
rr.initialize(is, tc);
tc.setStatus(is.toString());
return rr;
}
}
return new TrevniStorageInputFormat();
}
/*
* (non-Javadoc)
*
* @see org.apache.pig.StoreFuncInterface#getOutputFormat()
*/
@Override
public OutputFormat<NullWritable, Object> getOutputFormat() throws IOException {
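// Local output format that buffers rows in an AvroColumnWriter and writes a
// new part file whenever the buffered size reaches the default block size.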
class TrevniStorageOutputFormat extends
org.apache.hadoop.mapreduce.lib.output.FileOutputFormat<NullWritable, Object> {
private Schema schema;
TrevniStorageOutputFormat(Schema s) {
schema = s;
if (s == null) {
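// Fall back to a schema string stored under OUTPUT_AVRO_SCHEMA, if present.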
String schemaString = getProperties().getProperty(OUTPUT_AVRO_SCHEMA);
if (schemaString != null)
schema = (new Schema.Parser()).parse(schemaString);
}
}
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(final TaskAttemptContext tc) throws IOException,
InterruptedException {
if (schema == null) {
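// The schema may not have been available at construction time; retry the lookup.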
String schemaString = getProperties(ResourceSchema.class, null).getProperty(OUTPUT_AVRO_SCHEMA);
if (schemaString != null)
schema = (new Schema.Parser()).parse(schemaString);
}
final ColumnFileMetaData meta = new ColumnFileMetaData();
final Path dir = getOutputPath(tc);
final FileSystem fs = FileSystem.get(tc.getConfiguration());
final long blockSize = fs.getDefaultBlockSize();
if (!fs.mkdirs(dir))
throw new IOException("Failed to create directory: " + dir);
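// Compress the Trevni column data with Snappy.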
meta.setCodec("snappy");
return new RecordWriter<NullWritable, Object>() {
private int part = 0;
private AvroColumnWriter<GenericData.Record> writer =
new AvroColumnWriter<GenericData.Record>(schema, meta);
private void flush() throws IOException {
// Name part files by task id plus a per-task part counter.
int taskId = tc.getTaskAttemptID().getTaskID().getId();
String partName = String.format("%05d_%03d", taskId, part++);
OutputStream out = fs.create(new Path(dir, "part-" + partName + AvroTrevniOutputFormat.EXT));
try {
writer.writeTo(out);
} finally {
out.close();
}
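// Start a fresh in-memory writer for the next part file.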
writer = new AvroColumnWriter<GenericData.Record>(schema, meta);
}
@Override
public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
flush();
}
@Override
public void write(NullWritable n, Object o) throws IOException, InterruptedException {
GenericData.Record r = FastAvroStorageCommon.packIntoAvro((Tuple) o, schema);
writer.write(r);
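// Roll over to a new part file once the buffered data reaches one block.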
if (writer.sizeEstimate() >= blockSize)
flush();
}
};
}
}
return new TrevniStorageOutputFormat(schema);
}
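/**
* Finds the Avro schema by locating a visible Trevni file at or under
* {@code p} and reading its file schema.
*/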
@Override
public Schema getAvroSchema(Path p, Job job) throws IOException {
FileSystem fs = FileSystem.get(job.getConfiguration());
if (!fs.exists(p))
throw new IOException("Path " + p + " does not exist");
// Descend into directories until a data file is found; bail out rather
// than loop forever if a directory contains no visible files.
while (fs.getFileStatus(p).isDir()) {
FileStatus[] statusArray = fs.listStatus(p, visibleFiles);
if (statusArray == null || statusArray.length == 0)
throw new IOException("No visible files found under " + p);
p = statusArray[0].getPath();
}
if (visibleTrevniFiles.accept(p)) {
AvroColumnReader.Params params =
new AvroColumnReader.Params(new HadoopInput(p, job.getConfiguration()));
AvroColumnReader<GenericData.Record> reader =
new AvroColumnReader<GenericData.Record>(params);
try {
return reader.getFileSchema();
} finally {
reader.close();
}
} else
throw new IOException("No Trevni file found at or under " + p);
}
}