Source Code of org.apache.drill.exec.store.hive.HiveRecordReader

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.hive;

import java.io.IOException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.store.AbstractRecordReader;
import org.apache.drill.exec.vector.BigIntVector;
import org.apache.drill.exec.vector.BitVector;
import org.apache.drill.exec.vector.DateVector;
import org.apache.drill.exec.vector.Float4Vector;
import org.apache.drill.exec.vector.Float8Vector;
import org.apache.drill.exec.vector.IntVector;
import org.apache.drill.exec.vector.SmallIntVector;
import org.apache.drill.exec.vector.TimeStampVector;
import org.apache.drill.exec.vector.TinyIntVector;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.VarBinaryVector;
import org.apache.drill.exec.vector.VarCharVector;
import org.apache.drill.exec.vector.allocator.VectorAllocator;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

import com.google.common.collect.Lists;

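/**
 * Record reader that reads rows from a Hive table (or a single partition of it) using the table's
 * configured {@link InputFormat} and {@link SerDe}, and copies the deserialized values into Drill
 * value vectors. Partition column values are materialized as constant vectors alongside the data
 * columns.
 */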
public class HiveRecordReader extends AbstractRecordReader {

  protected Table table;
  protected Partition partition;
  protected InputSplit inputSplit;
  protected FragmentContext context;
  protected List<String> selectedColumnNames;
  protected List<TypeInfo> selectedColumnTypes = Lists.newArrayList();
  protected List<ObjectInspector> selectedColumnObjInspectors = Lists.newArrayList();
  protected List<HiveFieldConverter> selectedColumnFieldConverters = Lists.newArrayList();
  protected List<String> selectedPartitionNames = Lists.newArrayList();
  protected List<TypeInfo> selectedPartitionTypes = Lists.newArrayList();
  protected List<Object> selectedPartitionValues = Lists.newArrayList();
  protected List<String> tableColumns; // all columns in table (not including partition columns)
  protected SerDe serde;
  protected StructObjectInspector sInspector;
  protected Object key, value;
  protected org.apache.hadoop.mapred.RecordReader reader;
  protected List<ValueVector> vectors = Lists.newArrayList();
  protected List<ValueVector> pVectors = Lists.newArrayList();
  protected Object redoRecord;
  protected boolean empty;
  private Map<String, String> hiveConfigOverride;
  private FragmentContext fragmentContext;
  private OperatorContext operatorContext;


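  // Maximum number of records copied into a single batch, and the per-field byte estimate used
  // when pre-allocating variable-width vectors.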
  protected static final int TARGET_RECORD_COUNT = 4000;
  protected static final int FIELD_SIZE = 50;

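  /**
   * @param table              Hive metastore table to read from
   * @param partition          partition to read, or null for an unpartitioned table
   * @param inputSplit         split assigned to this reader, or null if there is nothing to read
   * @param projectedColumns   columns requested by the query (a star query selects all columns)
   * @param context            fragment context for this scan
   * @param hiveConfigOverride Hive configuration overrides applied on top of the table properties
   */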
  public HiveRecordReader(Table table, Partition partition, InputSplit inputSplit, List<SchemaPath> projectedColumns,
      FragmentContext context, Map<String, String> hiveConfigOverride) throws ExecutionSetupException {
    this.table = table;
    this.partition = partition;
    this.inputSplit = inputSplit;
    this.context = context;
    this.empty = (inputSplit == null && partition == null);
    this.hiveConfigOverride = hiveConfigOverride;
    this.fragmentContext = context;
    setColumns(projectedColumns);
    init();
  }

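  /**
   * Builds the JobConf from the table/partition metadata and the Hive config overrides,
   * instantiates the InputFormat and SerDe via reflection, resolves the selected data and
   * partition columns (including their ObjectInspectors and field converters), and finally opens
   * the underlying Hadoop RecordReader unless this reader has nothing to read.
   */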
  private void init() throws ExecutionSetupException {
    Properties properties;
    JobConf job = new JobConf();
    if (partition != null) {
      properties = MetaStoreUtils.getPartitionMetadata(partition, table);

      // SerDe expects properties from the Table, but the call above doesn't add Table properties.
      // Include Table properties in the final list so as not to break SerDes that depend on them.
      // For example, AvroSerDe gets the schema from the properties (passed as the second argument).
      for (Map.Entry<String, String> entry : table.getParameters().entrySet()) {
        if (entry.getKey() != null && entry.getValue() != null) {
          properties.put(entry.getKey(), entry.getValue());
        }
      }
    } else {
      properties = MetaStoreUtils.getTableMetadata(table);
    }
    for (Object obj : properties.keySet()) {
      job.set((String) obj, (String) properties.get(obj));
    }
    for(Map.Entry<String, String> entry : hiveConfigOverride.entrySet()) {
      job.set(entry.getKey(), entry.getValue());
    }
    InputFormat format;
    String sLib = (partition == null) ? table.getSd().getSerdeInfo().getSerializationLib() : partition.getSd().getSerdeInfo().getSerializationLib();
    String inputFormatName = (partition == null) ? table.getSd().getInputFormat() : partition.getSd().getInputFormat();
    try {
      format = (InputFormat) Class.forName(inputFormatName).getConstructor().newInstance();
      Class c = Class.forName(sLib);
      serde = (SerDe) c.getConstructor().newInstance();
      serde.initialize(job, properties);
    } catch (ReflectiveOperationException | SerDeException e) {
      throw new ExecutionSetupException("Unable to instantiate InputFormat or SerDe", e);
    }
    job.setInputFormat(format.getClass());

    List<FieldSchema> partitionKeys = table.getPartitionKeys();
    List<String> partitionNames = Lists.newArrayList();
    for (FieldSchema field : partitionKeys) {
      partitionNames.add(field.getName());
    }

    try {
      ObjectInspector oi = serde.getObjectInspector();
      if (oi.getCategory() != ObjectInspector.Category.STRUCT) {
        throw new UnsupportedOperationException(String.format("%s category not supported", oi.getCategory()));
      }
      sInspector = (StructObjectInspector) oi;
      StructTypeInfo sTypeInfo = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(sInspector);
      if (isStarQuery()) {
        selectedColumnNames = sTypeInfo.getAllStructFieldNames();
        tableColumns = selectedColumnNames;
      } else {
        tableColumns = sTypeInfo.getAllStructFieldNames();
        List<Integer> columnIds = Lists.newArrayList();
        selectedColumnNames = Lists.newArrayList();
        for (SchemaPath field : getColumns()) {
          String columnName = field.getRootSegment().getPath();
          if (!tableColumns.contains(columnName)) {
            if (partitionNames.contains(columnName)) {
              selectedPartitionNames.add(columnName);
            } else {
              throw new ExecutionSetupException(String.format("Column %s does not exist", columnName));
            }
          } else {
            columnIds.add(tableColumns.indexOf(columnName));
            selectedColumnNames.add(columnName);
          }
        }
        ColumnProjectionUtils.appendReadColumnIDs(job, columnIds);
        ColumnProjectionUtils.appendReadColumnNames(job, selectedColumnNames);
      }

      for (String columnName : selectedColumnNames) {
        ObjectInspector fieldOI = sInspector.getStructFieldRef(columnName).getFieldObjectInspector();
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(fieldOI.getTypeName());

        selectedColumnObjInspectors.add(fieldOI);
        selectedColumnTypes.add(typeInfo);
        selectedColumnFieldConverters.add(HiveFieldConverter.create(typeInfo));
      }

      if (isStarQuery()) {
        selectedPartitionNames = partitionNames;
      }

      for (int i = 0; i < table.getPartitionKeys().size(); i++) {
        FieldSchema field = table.getPartitionKeys().get(i);
        if (selectedPartitionNames.contains(field.getName())) {
          TypeInfo pType = TypeInfoUtils.getTypeInfoFromTypeString(field.getType());
          selectedPartitionTypes.add(pType);

          if (partition != null) {
            selectedPartitionValues.add(convertPartitionType(pType, partition.getValues().get(i)));
          }
        }
      }
    } catch (Exception e) {
      throw new ExecutionSetupException("Failure while initializing HiveRecordReader: " + e.getMessage(), e);
    }

    if (!empty) {
      try {
        reader = format.getRecordReader(inputSplit, job, Reporter.NULL);
      } catch (IOException e) {
        throw new ExecutionSetupException("Failed to get o.a.hadoop.mapred.RecordReader from Hive InputFormat", e);
      }
      key = reader.createKey();
      value = reader.createValue();
    }
  }

  public OperatorContext getOperatorContext() {
    return operatorContext;
  }

  public void setOperatorContext(OperatorContext operatorContext) {
    this.operatorContext = operatorContext;
  }

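  /**
   * Creates one optional value vector per selected data column and one required vector per
   * selected partition column, registering each with the output mutator.
   */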
  @Override
  public void setup(OutputMutator output) throws ExecutionSetupException {
    try {
      for (int i = 0; i < selectedColumnNames.size(); i++) {
        MajorType type = Types.optional(getMinorTypeFromHiveTypeInfo(selectedColumnTypes.get(i)));
        MaterializedField field = MaterializedField.create(SchemaPath.getSimplePath(selectedColumnNames.get(i)), type);
        Class vvClass = TypeHelper.getValueVectorClass(type.getMinorType(), type.getMode());
        vectors.add(output.addField(field, vvClass));
      }

      for (int i = 0; i < selectedPartitionNames.size(); i++) {
        MajorType type = Types.required(getMinorTypeFromHiveTypeInfo(selectedPartitionTypes.get(i)));
        MaterializedField field = MaterializedField.create(SchemaPath.getSimplePath(selectedPartitionNames.get(i)), type);
        Class vvClass = TypeHelper.getValueVectorClass(field.getType().getMinorType(), field.getDataMode());
        pVectors.add(output.addField(field, vvClass));
      }
    } catch(SchemaChangeException e) {
      throw new ExecutionSetupException(e);
    }
  }

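  /**
   * Reads up to TARGET_RECORD_COUNT records into the allocated vectors. A record that did not fit
   * into the previous batch (redoRecord) is written first; a record that does not fit into the
   * current batch is saved and retried on the next call.
   */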
  @Override
  public int next() {
    for (ValueVector vv : vectors) {
      VectorAllocator.getAllocator(vv, FIELD_SIZE).alloc(TARGET_RECORD_COUNT);
    }
    if (empty) {
      setValueCountAndPopulatePartitionVectors(0);
      return 0;
    }

    try {
      int recordCount = 0;

      if (redoRecord != null) {
        // Try writing the record that didn't fit into the last RecordBatch
        Object deSerializedValue = serde.deserialize((Writable) redoRecord);
        boolean status = readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, recordCount);
        if (!status) {
          throw new DrillRuntimeException("Current record is too big to fit into allocated ValueVector buffer");
        }
        redoRecord = null;
        recordCount++;
      }

      while (recordCount < TARGET_RECORD_COUNT && reader.next(key, value)) {
        Object deSerializedValue = serde.deserialize((Writable) value);
        boolean status = readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, recordCount);
        if (!status) {
          redoRecord = value;
          setValueCountAndPopulatePartitionVectors(recordCount);
          return recordCount;
        }
        recordCount++;
      }

      setValueCountAndPopulatePartitionVectors(recordCount);
      return recordCount;
    } catch (IOException | SerDeException e) {
      throw new DrillRuntimeException(e);
    }
  }

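  /**
   * Copies one deserialized Hive row into the output vectors at the given index. Returns false if
   * any column value does not fit into its vector's allocated buffer, so the caller can retry the
   * record in the next batch.
   */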
  private boolean readHiveRecordAndInsertIntoRecordBatch(Object deSerializedValue, int outputRecordIndex) {
    boolean success;
    for (int i = 0; i < selectedColumnNames.size(); i++) {
      String columnName = selectedColumnNames.get(i);
      Object hiveValue = sInspector.getStructFieldData(deSerializedValue, sInspector.getStructFieldRef(columnName));

      if (hiveValue != null) {
        success = selectedColumnFieldConverters.get(i).setSafeValue(selectedColumnObjInspectors.get(i), hiveValue,
            vectors.get(i), outputRecordIndex);

        if (!success) {
          return false;
        }
      }
    }

    return true;
  }

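  /**
   * Finalizes the batch by setting the value count on all data vectors and, for partitioned
   * reads, filling the partition vectors with the partition's constant values.
   */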
  private void setValueCountAndPopulatePartitionVectors(int recordCount) {
    for (ValueVector v : vectors) {
      v.getMutator().setValueCount(recordCount);
    }

    if (partition != null) {
      populatePartitionVectors(recordCount);
    }
  }

  @Override
  public void cleanup() {
  }

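  /**
   * Maps a Hive primitive type to the corresponding Drill minor type. DECIMAL is returned as
   * VARCHAR because the value is carried as its string representation.
   */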
  public static MinorType getMinorTypeFromHivePrimitiveTypeInfo(PrimitiveTypeInfo primitiveTypeInfo) {
    switch(primitiveTypeInfo.getPrimitiveCategory()) {
      case BINARY:
        return TypeProtos.MinorType.VARBINARY;
      case BOOLEAN:
        return TypeProtos.MinorType.BIT;
      case BYTE:
        return TypeProtos.MinorType.TINYINT;
      case DECIMAL:
        return TypeProtos.MinorType.VARCHAR;
      case DOUBLE:
        return TypeProtos.MinorType.FLOAT8;
      case FLOAT:
        return TypeProtos.MinorType.FLOAT4;
      case INT:
        return TypeProtos.MinorType.INT;
      case LONG:
        return TypeProtos.MinorType.BIGINT;
      case SHORT:
        return TypeProtos.MinorType.SMALLINT;
      case STRING:
      case VARCHAR:
        return TypeProtos.MinorType.VARCHAR;
      case TIMESTAMP:
        return TypeProtos.MinorType.TIMESTAMP;
      case DATE:
        return TypeProtos.MinorType.DATE;
    }

    throwUnsupportedHiveDataTypeError(primitiveTypeInfo.getPrimitiveCategory().toString());
    return null;
  }

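  /**
   * Maps a Hive type to a Drill minor type; only primitive categories are supported, while
   * complex types (LIST, MAP, STRUCT, UNION) raise an unsupported-type error.
   */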
  public static MinorType getMinorTypeFromHiveTypeInfo(TypeInfo typeInfo) {
    switch (typeInfo.getCategory()) {
      case PRIMITIVE:
        return getMinorTypeFromHivePrimitiveTypeInfo(((PrimitiveTypeInfo) typeInfo));

      case LIST:
      case MAP:
      case STRUCT:
      case UNION:
      default:
        throwUnsupportedHiveDataTypeError(typeInfo.getCategory().toString());
    }

    return null;
  }

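  /**
   * Fills each partition vector with the partition's single constant value, repeated once per
   * record in the batch, converting to the vector's native representation (e.g. millisecond
   * values for DATE/TIMESTAMP, byte arrays for STRING/VARCHAR/DECIMAL).
   */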
  protected void populatePartitionVectors(int recordCount) {
    for (int i = 0; i < pVectors.size(); i++) {
      int size = 50;
      ValueVector vector = pVectors.get(i);
      Object val = selectedPartitionValues.get(i);
      PrimitiveCategory pCat = ((PrimitiveTypeInfo)selectedPartitionTypes.get(i)).getPrimitiveCategory();
      if (pCat == PrimitiveCategory.BINARY || pCat == PrimitiveCategory.STRING || pCat == PrimitiveCategory.VARCHAR) {
        size = ((byte[]) selectedPartitionValues.get(i)).length;
      }

      VectorAllocator.getAllocator(vector, size).alloc(recordCount);

      switch(pCat) {
        case BINARY: {
          VarBinaryVector v = (VarBinaryVector) vector;
          byte[] value = (byte[]) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case BOOLEAN: {
          BitVector v = (BitVector) vector;
          Boolean value = (Boolean) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().set(j, value ? 1 : 0);
          }
          break;
        }
        case BYTE: {
          TinyIntVector v = (TinyIntVector) vector;
          byte value = (byte) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case DOUBLE: {
          Float8Vector v = (Float8Vector) vector;
          double value = (double) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case FLOAT: {
          Float4Vector v = (Float4Vector) vector;
          float value = (float) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case INT: {
          IntVector v = (IntVector) vector;
          int value = (int) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case LONG: {
          BigIntVector v = (BigIntVector) vector;
          long value = (long) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case SHORT: {
          SmallIntVector v = (SmallIntVector) vector;
          short value = (short) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case VARCHAR:
        case STRING: {
          VarCharVector v = (VarCharVector) vector;
          byte[] value = (byte[]) val;
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case TIMESTAMP: {
          TimeStampVector v = (TimeStampVector) vector;
          DateTime ts = new DateTime(((Timestamp) val).getTime()).withZoneRetainFields(DateTimeZone.UTC);
          long value = ts.getMillis();
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case DATE: {
          DateVector v = (DateVector) vector;
          DateTime date = new DateTime(((Date)val).getTime()).withZoneRetainFields(DateTimeZone.UTC);
          long value = date.getMillis();
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        case DECIMAL: {
          VarCharVector v = (VarCharVector) vector;
          byte[] value = ((HiveDecimal) val).toString().getBytes();
          for (int j = 0; j < recordCount; j++) {
            v.getMutator().setSafe(j, value);
          }
          break;
        }
        default:
          throwUnsupportedHiveDataTypeError(pCat.toString());
      }
      vector.getMutator().setValueCount(recordCount);
    }
  }

  /** Partition value is received in string format. Convert it into appropriate object based on the type. */
  private Object convertPartitionType(TypeInfo typeInfo, String value) {
    if (typeInfo.getCategory() != Category.PRIMITIVE) {
      // In Hive only primitive types are allowed as partition column types.
      throw new DrillRuntimeException("Non-primitive types are not allowed as partition column types in Hive, " +
          "but received: " + typeInfo.getCategory());
    }

    PrimitiveCategory pCat = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
    switch (pCat) {
      case BINARY:
        return value.getBytes();
      case BOOLEAN:
        return Boolean.parseBoolean(value);
      case BYTE:
        return Byte.parseByte(value);
      case DECIMAL:
        return new HiveDecimal(value);
      case DOUBLE:
        return Double.parseDouble(value);
      case FLOAT:
        return Float.parseFloat(value);
      case INT:
        return Integer.parseInt(value);
      case LONG:
        return Long.parseLong(value);
      case SHORT:
        return Short.parseShort(value);
      case STRING:
      case VARCHAR:
        return value.getBytes();
      case TIMESTAMP:
        return Timestamp.valueOf(value);
      case DATE:
        return Date.valueOf(value);
    }

    throwUnsupportedHiveDataTypeError(pCat.toString());
    return null;
  }

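  /** Throws a RuntimeException listing the Hive data types that Drill supports for querying. */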
  public static void throwUnsupportedHiveDataTypeError(String unsupportedType) {
    StringBuilder errMsg = new StringBuilder();
    errMsg.append(String.format("Unsupported Hive data type %s. ", unsupportedType));
    errMsg.append(System.getProperty("line.separator"));
    errMsg.append("The following Hive data types are supported in Drill for querying: ");
    errMsg.append(
        "BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, DATE, TIMESTAMP, BINARY, DECIMAL, STRING, and VARCHAR");

    throw new RuntimeException(errMsg.toString());
  }
}