Source Code of com.datasalt.pangool.tuplemr.mapred.lib.input.CascadingTupleInputFormat

package com.datasalt.pangool.tuplemr.mapred.lib.input;

import static cascading.tuple.hadoop.TupleSerializationProps.HADOOP_IO_SERIALIZATIONS;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Tuple;

/**
* A Pangool-friendly InputFormat that wraps SequenceFiles containing Cascading Tuples.
* The Schema is discovered lazily from the first Cascading Tuple seen. The type correspondence is:
* <ul>
*   <li>Integer - INT</li>
*   <li>Long - LONG</li>
*   <li>Float - FLOAT</li>
*   <li>Double - DOUBLE</li>
*   <li>String - STRING</li>
*   <li>Short - INT</li>
*   <li>Boolean - BOOLEAN</li>
* </ul>
* Any other type is unrecognized and an IOException is thrown.
* <p>
* Column names must be provided to the InputFormat, since Cascading doesn't save them anywhere.
* The schemaName is used to instantiate a Pangool Schema.
* <p>
* Note that for this to work, Cascading serialization must be enabled in the Hadoop Configuration.
* You can do this by calling static method {@link #setSerializations(Configuration)}.
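* <p>
* A minimal usage sketch (illustrative only: it assumes Pangool's
* {@code TupleMRBuilder.addInput(Path, InputFormat, TupleMapper)} builder method; the paths, schema
* name, field names and mapper are placeholders, and the rest of the job wiring is omitted):
* <pre>{@code
* Configuration conf = new Configuration();
* CascadingTupleInputFormat.setSerializations(conf);
* TupleMRBuilder builder = new TupleMRBuilder(conf, "read-cascading-output");
* builder.addInput(new Path("/cascading/output"),
*     new CascadingTupleInputFormat("logs", "day", "month", "count"),
*     myTupleMapper); // user-provided TupleMapper<ITuple, NullWritable>
* }</pre>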
*/
@SuppressWarnings("serial")
public class CascadingTupleInputFormat extends FileInputFormat<ITuple, NullWritable> implements Serializable {

  private static final Logger log = LoggerFactory.getLogger(CascadingTupleInputFormat.class);
 
  private String schemaName;
  private String[] fieldNames;

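  /**
   * @param schemaName name given to the lazily created Pangool Schema
   * @param fieldNames column names, in positional order, for the fields of the incoming Cascading Tuples
   */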
  public CascadingTupleInputFormat(String schemaName, String... fieldNames) {
    this.schemaName = schemaName;
    this.fieldNames = fieldNames;
  }

  /**
   * Like Cascading's TupleSerialization.setSerializations(), but accepting a Hadoop Configuration rather than a JobConf.
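   * <p>
   * For example (illustrative value): if {@code io.serializations} previously held only
   * {@code org.apache.hadoop.io.serializer.WritableSerialization}, after this call it reads
   * {@code cascading.tuple.hadoop.TupleSerialization,org.apache.hadoop.io.serializer.WritableSerialization}.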
   */
  public static void setSerializations(Configuration conf) {
    String serializations = conf.get(HADOOP_IO_SERIALIZATIONS);

    LinkedList<String> list = new LinkedList<String>();

    if(serializations != null && !serializations.isEmpty())
      Collections.addAll(list, serializations.split(","));

    // required by MultiInputSplit
    String writable = WritableSerialization.class.getName();
    String tuple = TupleSerialization.class.getName();

    list.remove(writable);
    list.remove(tuple);

    list.addFirst(writable);
    list.addFirst(tuple);

    // TupleSerialization goes first, then WritableSerialization, then any previously configured serializations
    conf.set(HADOOP_IO_SERIALIZATIONS, Util.join(list, ","));
  }
 
  @Override
  public RecordReader<ITuple, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx)
      throws IOException, InterruptedException {

    return new RecordReader<ITuple, NullWritable>() {

      RecordReader<cascading.tuple.Tuple, cascading.tuple.Tuple> delegatingRecordReader;
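      // Reusable Pangool Tuple, created lazily once the first record reveals its field types.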
      ITuple tuple;

      @Override
      public void close() throws IOException {
        if(delegatingRecordReader != null) {
          delegatingRecordReader.close();
        }
      }

      @Override
      public ITuple getCurrentKey() throws IOException, InterruptedException {
        cascading.tuple.Tuple cTuple = delegatingRecordReader.getCurrentValue();
       
        if(tuple == null) {
          int i = 0;
          List<Field> fields = new ArrayList<Field>();
         
          for(Class<?> cl : cTuple.getTypes()) {
            if(cl.equals(Integer.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.INT, true));
            } else if(cl.equals(Long.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.LONG, true));
            } else if(cl.equals(Float.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.FLOAT, true));
            } else if(cl.equals(Double.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.DOUBLE, true));
            } else if(cl.equals(String.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.STRING, true));
            } else if(cl.equals(Boolean.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.BOOLEAN, true));
            } else if(cl.equals(Short.class)) {
              fields.add(Field.create(fieldNames[i], Field.Type.INT, true));
            } else {
              throw new IOException("Can't handle type [" + cl + "] - only Java primitive wrapper types and String are allowed.");
            }
            i++;
          }
          Schema schema = new Schema(schemaName, fields);
          log.info("Lazily instantiated a Pangool Schema from Cascading Tuple: [" + schema + "]");
          tuple = new Tuple(schema);
        }
       
        // Copy values positionally, without re-validating against the Schema every time.
        // This is more efficient, but type mismatches will only surface later.
        for(int i = 0; i < tuple.getSchema().getFields().size(); i++) {
          tuple.set(i, cTuple.getObject(i));
        }
       
        return tuple;
      }

      @Override
      public NullWritable getCurrentValue() throws IOException, InterruptedException {
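        // All of the data travels in the key; the value is a constant NullWritable.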
        return NullWritable.get();
      }

      @Override
      public float getProgress() throws IOException, InterruptedException {
        return delegatingRecordReader.getProgress();
      }

      @Override
      public void initialize(InputSplit iS, TaskAttemptContext ctx) throws IOException,
          InterruptedException {
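        // Cascading persists its Tuples in standard Hadoop SequenceFiles, so a stock
        // SequenceFileInputFormat provides the delegate reader; getCurrentKey() consumes
        // the value side of each record.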
        delegatingRecordReader = new SequenceFileInputFormat<cascading.tuple.Tuple, cascading.tuple.Tuple>()
            .createRecordReader(iS, ctx);
        delegatingRecordReader.initialize(iS, ctx);
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        return delegatingRecordReader.nextKeyValue();
      }
    };
  }
}