Source Code of org.apache.howl.rcfile.RCFileInputDriver

package org.apache.howl.rcfile;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
import org.apache.hadoop.hive.serde2.columnar.ColumnarStruct;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.howl.common.HowlUtil;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlFieldSchema;
import org.apache.howl.data.schema.HowlSchema;
import org.apache.howl.mapreduce.HowlInputStorageDriver;

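/**
 * Howl input storage driver for data stored in RCFile format. Reads rows
 * through {@link RCFileMapReduceInputFormat}, deserializes them with Hive's
 * {@link ColumnarSerDe}, and converts each row into a {@link HowlRecord},
 * filling in partition column values that are not present in the stored data.
 */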
public class RCFileInputDriver extends HowlInputStorageDriver {

  private SerDe serde;
  private static final Log LOG = LogFactory.getLog(RCFileInputDriver.class);
  private List<HowlFieldSchema> colsInData;
  private StructObjectInspector oi;
  private Map<String,String> partValues;
  private List<HowlFieldSchema> outCols;
  private List<? extends StructField> structFields;
  private Map<String,Integer> namePosMapping;

  @Override
  public InputFormat<? extends WritableComparable, ? extends Writable> getInputFormat(Properties howlProperties) {
    return new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
  }

  @Override
  public void setInputPath(JobContext jobContext, String location) throws IOException {

    super.setInputPath(jobContext, location);
  }

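  /**
   * Records the schema of the data as it is stored on disk and builds a
   * column-name to column-position map, which is later used for column
   * projection and for record conversion.
   */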
  @Override
  public void setOriginalSchema(JobContext jobContext, HowlSchema dataSchema) throws IOException {

    colsInData = dataSchema.getFields();
    namePosMapping = new HashMap<String, Integer>(colsInData.size());
    int index = 0;
    for(HowlFieldSchema field : dataSchema.getFields()){
      namePosMapping.put(field.getName(), index++);
    }
  }

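  /**
   * Works out which column ids of the stored data are needed for the desired
   * output schema and registers them with {@link ColumnProjectionUtils}, so
   * that RCFile only materializes those columns. Columns not found in the data
   * schema (i.e. partition columns) are skipped here and supplied later from
   * the partition values.
   */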
  @Override
  public void setOutputSchema(JobContext jobContext, HowlSchema desiredSchema) throws IOException {

    // Find out which column ids need to be projected and set them up for RCFile.
    outCols = desiredSchema.getFields();
    ArrayList<Integer> prjColumns = new ArrayList<Integer>();
    for(HowlFieldSchema prjCol : outCols){
      Integer pos = namePosMapping.get(prjCol.getName().toLowerCase());
      if(pos != null) {
        prjColumns.add(pos);
      }
    }

    Collections.sort(prjColumns);
    ColumnProjectionUtils.setReadColumnIDs(jobContext.getConfiguration(), prjColumns);
  }

  @Override
  public void setPartitionValues(JobContext jobContext, Map<String, String> partitionValues)
  throws IOException {
    partValues = partitionValues;
  }

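  /**
   * Deserializes one RCFile row into a {@link ColumnarStruct} and converts it
   * into a {@link HowlRecord} in output-schema order. Columns present in the
   * stored data are turned into plain Java objects via getTypedObj(); columns
   * missing from the data are treated as partition columns and filled in (as
   * Strings) from the partition values.
   */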
  @Override
  public HowlRecord convertToHowlRecord(WritableComparable ignored, Writable bytesRefArray) throws IOException {

    // Deserialize bytesRefArray into struct and then convert that struct to
    // HowlRecord.
    ColumnarStruct struct;
    try {
      struct = (ColumnarStruct)serde.deserialize(bytesRefArray);
    } catch (SerDeException e) {
      LOG.error(e.toString(), e);
      throw new IOException(e);
    }

    List<Object> outList = new ArrayList<Object>(outCols.size());

    String colName;
    Integer index;

    for(HowlFieldSchema col : outCols){

      colName = col.getName().toLowerCase();
      index = namePosMapping.get(colName);

      if(index != null){
        StructField field = structFields.get(index);
        outList.add( getTypedObj(oi.getStructFieldData(struct, field), field.getFieldObjectInspector()));
      }

      else {
        outList.add(partValues.get(colName));
      }

    }
    return new DefaultHowlRecord(outList);
  }

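  /**
   * Recursively converts a (possibly lazy) Hive object into plain Java
   * objects, using the supplied ObjectInspector to walk primitives, maps,
   * lists and structs.
   */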
  private Object getTypedObj(Object data, ObjectInspector oi) throws IOException{

    // The real work-horse method. We give up the laziness benefits of
    // Hive-RCFile by deserializing everything and building a HowlRecord of
    // plain Java objects, because higher layers may not know how to handle
    // Hive's lazy objects.

    switch(oi.getCategory()){

    case PRIMITIVE:
      return ((PrimitiveObjectInspector)oi).getPrimitiveJavaObject(data);

    case MAP:
      MapObjectInspector moi = (MapObjectInspector)oi;
      Map<?,?> lazyMap = moi.getMap(data);
      ObjectInspector keyOI = moi.getMapKeyObjectInspector();
      ObjectInspector valOI = moi.getMapValueObjectInspector();
      Map<Object,Object> typedMap = new HashMap<Object,Object>(lazyMap.size());
      for(Entry<?,?> e : lazyMap.entrySet()){
        typedMap.put(getTypedObj(e.getKey(), keyOI), getTypedObj(e.getValue(), valOI));
      }
      return typedMap;

    case LIST:
      ListObjectInspector loi = (ListObjectInspector)oi;
      List<?> lazyList = loi.getList(data);
      ObjectInspector elemOI = loi.getListElementObjectInspector();
      List<Object> typedList = new ArrayList<Object>(lazyList.size());
      Iterator<?> itr = lazyList.listIterator();
      while(itr.hasNext()){
        typedList.add(getTypedObj(itr.next(),elemOI));
      }
      return typedList;

    case STRUCT:
      StructObjectInspector soi = (StructObjectInspector)oi;
      List<? extends StructField> fields = soi.getAllStructFieldRefs();
      List<Object> typedStruct = new ArrayList<Object>(fields.size());
      for(StructField field : fields){
        typedStruct.add( getTypedObj(soi.getStructFieldData(data, field), field.getFieldObjectInspector()));
      }
      return typedStruct;


    default:
      throw new IOException("Don't know how to deserialize: "+oi.getCategory());

    }
  }

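  /**
   * Creates and initializes the {@link ColumnarSerDe} with the column names
   * and types of the stored data plus the null/serialization format
   * properties, and caches the resulting object inspector and struct fields
   * for use during record conversion.
   */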
  @Override
  public void initialize(JobContext context, Properties howlProperties)
  throws IOException {

    super.initialize(context, howlProperties);

    // ColumnarSerDe needs to know the names and types of the columns it will read.
    List<FieldSchema> fields = HowlUtil.getFieldSchemaList(colsInData);
    howlProperties.setProperty(Constants.LIST_COLUMNS,
        MetaStoreUtils.getColumnNamesFromFieldSchema(fields));
    howlProperties.setProperty(Constants.LIST_COLUMN_TYPES,
        MetaStoreUtils.getColumnTypesFromFieldSchema(fields));

    // RCFile seems to read and write nulls differently than default Hive,
    // so set these properties to match LazySimpleSerDe.
    howlProperties.setProperty(Constants.SERIALIZATION_NULL_FORMAT, "\\N");
    howlProperties.setProperty(Constants.SERIALIZATION_FORMAT, "1");

    try {
      serde = new ColumnarSerDe();
      serde.initialize(context.getConfiguration(), howlProperties);
      oi = (StructObjectInspector) serde.getObjectInspector();
      structFields = oi.getAllStructFieldRefs();

    } catch (SerDeException e) {
      throw new IOException(e);
    }
  }
}
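
Usage sketch (not part of the original source): the sequence below shows how a
caller such as Howl's input-format layer is assumed to drive this driver, based
on the dependencies visible above (initialize() uses the schema recorded by
setOriginalSchema(), and setOutputSchema() uses the resulting name/position
map). The jobContext, schema, partition-value and property objects are assumed
to be supplied by that caller.

HowlInputStorageDriver driver = new RCFileInputDriver();
driver.setOriginalSchema(jobContext, dataSchema);        // schema of the data as stored
driver.setPartitionValues(jobContext, partitionValues);  // values of the partition columns
driver.setOutputSchema(jobContext, outputSchema);        // columns the job actually wants
driver.initialize(jobContext, properties);               // sets up the ColumnarSerDe and object inspector

InputFormat<? extends WritableComparable, ? extends Writable> inputFormat =
    driver.getInputFormat(properties);
// ... obtain a RecordReader from inputFormat, then for each (key, value) pair read:
HowlRecord record = driver.convertToHowlRecord(key, value);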