Source Code of org.apache.howl.pig.HowlStorer

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.howl.pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.howl.common.HowlException;
import org.apache.howl.common.HowlUtil;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlFieldSchema;
import org.apache.howl.data.schema.HowlSchema;
import org.apache.howl.data.schema.HowlFieldSchema.Type;
import org.apache.howl.mapreduce.HowlOutputCommitter;
import org.apache.howl.mapreduce.HowlOutputFormat;
import org.apache.howl.mapreduce.HowlTableInfo;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreMetadata;
import org.apache.pig.backend.BackendException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.parser.ParseException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

/**
 * HowlStorer: a Pig {@link StoreFunc} that writes Pig tuples into a
 * Howl-managed table, converting Pig's schema into the table's HowlSchema.
 *
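 * <p>A usage sketch in Pig Latin (database, table, partition spec and column
 * schema below are illustrative examples, not taken from this codebase):</p>
 * <pre>
 *   STORE data INTO 'mydb.mytable'
 *     USING org.apache.howl.pig.HowlStorer('ds=20110101', 'a:int,b:chararray');
 * </pre>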
*/

public class HowlStorer extends StoreFunc implements StoreMetadata {

  /**
   * Property key under which the computed output schema is serialized in the UDF context.
   */
  private static final String COMPUTED_OUTPUT_SCHEMA = "howl.output.schema";
  private final Map<String,String> partitions;
  private Schema pigSchema;
  private RecordWriter<WritableComparable<?>, HowlRecord> writer;
  private HowlSchema computedSchema;
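  // Property key under which the Pig-provided store schema is serialized in the UDF context.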
  private static final String PIG_SCHEMA = "howl.pig.store.schema";
  private String sign;

  public HowlStorer(String partSpecs, String schema) throws ParseException, FrontendException {
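    // partSpecs is a comma-separated list of key=value pairs,
    // e.g. "ds=20110101,region=us" (example values are illustrative).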

    partitions = new HashMap<String, String>();
    if(partSpecs != null && !partSpecs.trim().isEmpty()){
      String[] partKVPs = partSpecs.split(",");
      for(String partKVP : partKVPs){
        String[] partKV = partKVP.split("=");
        if(partKV.length == 2) {
          partitions.put(partKV[0].trim(), partKV[1].trim());
        } else {
          throw new FrontendException("Invalid partition column specification. "+partSpecs, PigHowlUtil.PIG_EXCEPTION_CODE);
        }
      }
    }

    if(schema != null) {
      pigSchema = Utils.getSchemaFromString(schema);
    }

  }

  public HowlStorer(String partSpecs) throws ParseException, FrontendException {
    this(partSpecs, null);
  }

  public HowlStorer() throws FrontendException, ParseException{
    this(null,null);
  }

  @Override
  public void checkSchema(ResourceSchema resourceSchema) throws IOException {

    /* The schema provided by the user and the schema computed by Pig
     * at the time store is called must match.
     */
    Schema runtimeSchema = Schema.getPigSchema(resourceSchema);
    if(pigSchema != null){
      if(! Schema.equals(runtimeSchema, pigSchema, false, true) ){
        throw new FrontendException("Schema provided in store statement doesn't match with the schema " +
            "returned by Pig run-time. Schema provided in HowlStorer: "+pigSchema.toString()+ " Schema received from Pig runtime: "+runtimeSchema.toString(), PigHowlUtil.PIG_EXCEPTION_CODE);
      }
    }
    } else {
      pigSchema = runtimeSchema;
    }
    UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign}).setProperty(PIG_SCHEMA,ObjectSerializer.serialize(pigSchema));
  }

  /** Constructs a HowlSchema from pigSchema. The passed tableSchema is the
   * existing schema of the table in the metastore.
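   * For a bag whose inner tuple wraps a single primitive field, the tuple
   * wrapper is dropped so the bag maps to a Howl ARRAY of that primitive
   * (see removeTupleFromBag).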
   */
  private HowlSchema convertPigSchemaToHowlSchema(Schema pigSchema, HowlSchema tableSchema) throws FrontendException{

    List<HowlFieldSchema> fieldSchemas = new ArrayList<HowlFieldSchema>(pigSchema.size());
    for(FieldSchema fSchema : pigSchema.getFields()){
      byte type = fSchema.type;
      HowlFieldSchema howlFSchema;

      try {

        // Find out if we need to throw away the tuple or not.
        if(type == DataType.BAG && removeTupleFromBag(tableSchema, fSchema)){
          List<HowlFieldSchema> arrFields = new ArrayList<HowlFieldSchema>(1);
          arrFields.add(getHowlFSFromPigFS(fSchema.schema.getField(0).schema.getField(0)));
          howlFSchema = new HowlFieldSchema(fSchema.alias, Type.ARRAY, new HowlSchema(arrFields), null);
        } else {
          howlFSchema = getHowlFSFromPigFS(fSchema);
        }
        fieldSchemas.add(howlFSchema);
      } catch (HowlException he){
        throw new FrontendException(he.getMessage(), PigHowlUtil.PIG_EXCEPTION_CODE, he);
      }
    }

    return new HowlSchema(fieldSchemas);
  }

  private void validateUnNested(Schema innerSchema) throws FrontendException{

    for(FieldSchema innerField : innerSchema.getFields()){
      validateAlias(innerField.alias);
      if(DataType.isComplex(innerField.type)) {
        throw new FrontendException("Complex types cannot be nested. "+innerField, PigHowlUtil.PIG_EXCEPTION_CODE);
      }
    }
  }

  private boolean removeTupleFromBag(HowlSchema tableSchema, FieldSchema bagFieldSchema) throws HowlException{
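    // A Pig bag always wraps its elements in tuples. The wrapper tuple is
    // dropped when the matching table column stores an array of a primitive
    // (not a STRUCT), or when the column is new and its inner tuple has a
    // single untyped field.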

    String colName = bagFieldSchema.alias;
    for(HowlFieldSchema field : tableSchema.getFields()){
      if(colName.equalsIgnoreCase(field.getName())){
        return field.getArrayElementSchema().get(0).getType() != Type.STRUCT;
      }
    }
    // Column was not found in the table schema. It's a new column.
    List<FieldSchema> tupSchema = bagFieldSchema.schema.getFields();
    return tupSchema.size() == 1 && tupSchema.get(0).schema == null;
  }


  private HowlFieldSchema getHowlFSFromPigFS(FieldSchema fSchema) throws FrontendException, HowlException{
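    // Pig-to-Howl type mapping implemented by the switch below:
    //   chararray/bigchararray -> STRING, int -> INT, long -> BIGINT,
    //   float -> FLOAT, double -> DOUBLE, bag -> ARRAY, tuple -> STRUCT,
    //   map -> MAP with string keys.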

    byte type = fSchema.type;
    switch(type){

    case DataType.CHARARRAY:
    case DataType.BIGCHARARRAY:
      return new HowlFieldSchema(fSchema.alias, Type.STRING, null);

    case DataType.INTEGER:
      return new HowlFieldSchema(fSchema.alias, Type.INT, null);

    case DataType.LONG:
      return new HowlFieldSchema(fSchema.alias, Type.BIGINT, null);

    case DataType.FLOAT:
      return new HowlFieldSchema(fSchema.alias, Type.FLOAT, null);

    case DataType.DOUBLE:
      return new HowlFieldSchema(fSchema.alias, Type.DOUBLE, null);

    case DataType.BAG:
      Schema bagSchema = fSchema.schema;
      List<HowlFieldSchema> arrFields = new ArrayList<HowlFieldSchema>(1);
      arrFields.add(getHowlFSFromPigFS(bagSchema.getField(0)));
      return new HowlFieldSchema(fSchema.alias, Type.ARRAY, new HowlSchema(arrFields), "");

    case DataType.TUPLE:
      List<String> fieldNames = new ArrayList<String>();
      List<HowlFieldSchema> howlFSs = new ArrayList<HowlFieldSchema>();
      for( FieldSchema fieldSchema : fSchema.schema.getFields()){
        fieldNames.add( fieldSchema.alias);
        howlFSs.add(getHowlFSFromPigFS(fieldSchema));
      }
      return new HowlFieldSchema(fSchema.alias, Type.STRUCT, new HowlSchema(howlFSs), "");

    case DataType.MAP:{
      // Pig's schema contains no type information about a map's keys and
      // values. So, if it's a new column, assume <string,string>; if it's an
      // existing column, return whatever the existing column contains.
      HowlFieldSchema mapField = getTableCol(fSchema.alias, howlTblSchema);
      HowlFieldSchema valFS;
      List<HowlFieldSchema> valFSList = new ArrayList<HowlFieldSchema>(1);

      if(mapField != null){
        Type mapValType = mapField.getMapValueSchema().get(0).getType();

        switch(mapValType){
        case STRING:
        case BIGINT:
        case INT:
        case FLOAT:
        case DOUBLE:
          valFS = new HowlFieldSchema(fSchema.alias, mapValType, null);
          break;
        default:
          throw new FrontendException("Only pig primitive types are supported as map value types.", PigHowlUtil.PIG_EXCEPTION_CODE);
        }
        valFSList.add(valFS);
        return new HowlFieldSchema(fSchema.alias,Type.MAP,Type.STRING, new HowlSchema(valFSList),"");
      }

      // Column not found in target table. It's a new column whose schema is map<string,string>.
      valFS = new HowlFieldSchema(fSchema.alias, Type.STRING, "");
      valFSList.add(valFS);
      return new HowlFieldSchema(fSchema.alias,Type.MAP,Type.STRING, new HowlSchema(valFSList),"");
     }

    default:
      throw new FrontendException("Unsupported type: "+type+"  in Pig's schema", PigHowlUtil.PIG_EXCEPTION_CODE);
    }
  }

  @Override
  public OutputFormat getOutputFormat() throws IOException {
    return new HowlOutputFormat();
  }

  @Override
  public void prepareToWrite(RecordWriter writer) throws IOException {
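    // Called on the backend before writing begins; restore the output schema
    // computed at job-submission time from the UDF context.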
    this.writer = writer;
    computedSchema = (HowlSchema)ObjectSerializer.deserialize(UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign}).getProperty(COMPUTED_OUTPUT_SCHEMA));
  }

  @Override
  public void putNext(Tuple tuple) throws IOException {
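    // Convert each field of the Pig tuple into its Howl representation,
    // following the computed output schema, then write the record.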

    List<Object> outgoing = new ArrayList<Object>(tuple.size());

    int i = 0;
    for(HowlFieldSchema fSchema : computedSchema.getFields()){
      outgoing.add(getJavaObj(tuple.get(i++), fSchema));
    }
    try {
      writer.write(null, new DefaultHowlRecord(outgoing));
    } catch (InterruptedException e) {
      throw new BackendException("Error while writing tuple: "+tuple, PigHowlUtil.PIG_EXCEPTION_CODE, e);
    }
  }

  private Object getJavaObj(Object pigObj, HowlFieldSchema howlFS) throws ExecException, HowlException{

    // The real workhorse. Spend time and energy in this method if there is a
    // need to keep HowlStorer lean and fast.
    Type type = howlFS.getType();

    switch(type){

    case STRUCT:
      // Unwrap the tuple.
      return ((Tuple)pigObj).getAll();
    case ARRAY:
      // Unwrap the bag.
      DataBag pigBag = (DataBag)pigObj;
      HowlFieldSchema tupFS = howlFS.getArrayElementSchema().get(0);
      boolean needTuple = tupFS.getType() == Type.STRUCT;
      List<Object> bagContents = new ArrayList<Object>((int)pigBag.size());
      Iterator<Tuple> bagItr = pigBag.iterator();

      while(bagItr.hasNext()){
        // If the tuple contained in the bag has only one element, we throw away the tuple wrapper.
        bagContents.add(needTuple ? getJavaObj(bagItr.next(), tupFS) : bagItr.next().get(0));
      }
      return bagContents;

    default:
      return pigObj;
    }
  }

  @Override
  public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {

    // We must override this method because the default implementation assumes
    // an HDFS-based location string.
    return location;
  }

  @Override
  public void setStoreFuncUDFContextSignature(String signature) {
    sign = signature;
  }


  private void doSchemaValidations(Schema pigSchema, HowlSchema tblSchema) throws FrontendException, HowlException{

    // Iterate through all the elements in Pig Schema and do validations as
    // dictated by semantics, consult HowlSchema of table when need be.

    for(FieldSchema pigField : pigSchema.getFields()){
      byte type = pigField.type;
      String alias = pigField.alias;
      validateAlias(alias);
      HowlFieldSchema howlField = getTableCol(alias, tblSchema);

      if(DataType.isComplex(type)){
        switch(type){

        case DataType.MAP:
          if(howlField != null){
            if(howlField.getMapKeyType() != Type.STRING){
              throw new FrontendException("Key Type of map must be String "+howlField,  PigHowlUtil.PIG_EXCEPTION_CODE);
            }
            if(howlField.getMapValueSchema().get(0).isComplex()){
              throw new FrontendException("Value type of map cannot be complex" + howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
            }
          }
          break;

        case DataType.BAG:
          // Only map is allowed as complex type in tuples inside bag.
          for(FieldSchema innerField : pigField.schema.getField(0).schema.getFields()){
            if(innerField.type == DataType.BAG || innerField.type == DataType.TUPLE) {
              throw new FrontendException("Complex types cannot be nested. "+innerField, PigHowlUtil.PIG_EXCEPTION_CODE);
            }
            validateAlias(innerField.alias);
          }
          if(howlField != null){
            // Do the same validation for HowlSchema.
            HowlFieldSchema arrayFieldSchema = howlField.getArrayElementSchema().get(0);
            Type hType = arrayFieldSchema.getType();
            if(hType == Type.STRUCT){
              for(HowlFieldSchema structFieldInBag : arrayFieldSchema.getStructSubSchema().getFields()){
                if(structFieldInBag.getType() == Type.STRUCT || structFieldInBag.getType() == Type.ARRAY){
                  throw new FrontendException("Nested Complex types not allowed "+ howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
                }
              }
            }
            if(hType == Type.MAP){
              if(arrayFieldSchema.getMapKeyType() != Type.STRING){
                throw new FrontendException("Key Type of map must be String "+howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
              }
              if(arrayFieldSchema.getMapValueSchema().get(0).isComplex()){
                throw new FrontendException("Value type of map cannot be complex "+howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
              }
            }
            if(hType == Type.ARRAY) {
              throw new FrontendException("Arrays cannot contain array within it. "+howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
            }
          }
          break;

        case DataType.TUPLE:
          validateUnNested(pigField.schema);
          if(howlField != null){
            for(HowlFieldSchema structFieldSchema : howlField.getStructSubSchema().getFields()){
              if(structFieldSchema.isComplex()){
                throw new FrontendException("Nested Complex types are not allowed."+howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
              }
            }
          }
          break;

        default:
          throw new FrontendException("Internal Error.", PigHowlUtil.PIG_EXCEPTION_CODE);
        }
      }
    }

    for(HowlFieldSchema howlField : tblSchema.getFields()){

      // We don't do type promotion/demotion.
      Type hType = howlField.getType();
      switch(hType){
      case SMALLINT:
      case TINYINT:
      case BOOLEAN:
        throw new FrontendException("Incompatible type found in howl table schema: "+howlField, PigHowlUtil.PIG_EXCEPTION_CODE);
      }
    }
  }

  private void validateAlias(String alias) throws FrontendException{
    if(alias == null) {
      throw new FrontendException("Column name for a field is not specified. Please provide the full schema as an argument to HowlStorer.", PigHowlUtil.PIG_EXCEPTION_CODE);
    }
    if(alias.matches(".*[A-Z]+.*")) {
      throw new FrontendException("Column names should all be in lowercase. Invalid name found: "+alias, PigHowlUtil.PIG_EXCEPTION_CODE);
    }
  }

  // Finds a column by name in HowlSchema; returns null if not found.
  private HowlFieldSchema getTableCol(String alias, HowlSchema tblSchema){

    for(HowlFieldSchema howlField : tblSchema.getFields()){
      if(howlField.getName().equalsIgnoreCase(alias)){
        return howlField;
      }
    }
    // It's a new column.
    return null;
  }

  // Schema of the target table, fetched from the metastore in setStoreLocation.
  HowlSchema howlTblSchema;

  @Override
  public void cleanupOnFailure(String location, Job job) throws IOException {
    // No-op.
  }

  @Override
  public void setStoreLocation(String location, Job job) throws IOException {

    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{sign});

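    // The location argument is either "dbname.tablename" or just "tablename",
    // in which case the default database is used.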
    String[] userStr = location.split("\\.");
    HowlTableInfo tblInfo;
    if(userStr.length == 2) {
      tblInfo = HowlTableInfo.getOutputTableInfo(PigHowlUtil.getHowlServerUri(job),
          PigHowlUtil.getHowlServerPrincipal(job), userStr[0],userStr[1],partitions);
    } else {
      tblInfo = HowlTableInfo.getOutputTableInfo(PigHowlUtil.getHowlServerUri(job),
          PigHowlUtil.getHowlServerPrincipal(job), null,userStr[0],partitions);
    }

    Configuration config = job.getConfiguration();
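    // At job-submission time (frontend), compute and validate the schemas and
    // stash the output info in the UDF context; at task time (backend),
    // restore that info into the job configuration.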
    if(!HowlUtil.checkJobContextIfRunningFromBackend(job)){

      Schema schema = (Schema)ObjectSerializer.deserialize(p.getProperty(PIG_SCHEMA));
      if(schema != null){
        pigSchema = schema;
      }
      if(pigSchema == null){
        throw new FrontendException("Schema for data cannot be determined.", PigHowlUtil.PIG_EXCEPTION_CODE);
      }
      try{
        HowlOutputFormat.setOutput(job, tblInfo);
      } catch(HowlException he) {
        // Pass the message to the user - essentially something about the table
        // information passed to HowlOutputFormat was not right.
        throw new PigException(he.getMessage(), PigHowlUtil.PIG_EXCEPTION_CODE, he);
      }
      howlTblSchema = HowlOutputFormat.getTableSchema(job);
      try{
        doSchemaValidations(pigSchema, howlTblSchema);
      } catch(HowlException he){
        throw new FrontendException(he.getMessage(), PigHowlUtil.PIG_EXCEPTION_CODE, he);
      }
      computedSchema = convertPigSchemaToHowlSchema(pigSchema,howlTblSchema);
      HowlOutputFormat.setSchema(job, computedSchema);
      p.setProperty(HowlOutputFormat.HOWL_KEY_OUTPUT_INFO, config.get(HowlOutputFormat.HOWL_KEY_OUTPUT_INFO));
      if(config.get(HowlOutputFormat.HOWL_KEY_HIVE_CONF) != null){
        p.setProperty(HowlOutputFormat.HOWL_KEY_HIVE_CONF, config.get(HowlOutputFormat.HOWL_KEY_HIVE_CONF));
      }
      if(config.get(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE) != null){
        p.setProperty(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE,
            config.get(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE));
      }
      p.setProperty(COMPUTED_OUTPUT_SCHEMA,ObjectSerializer.serialize(computedSchema));

    } else {
      config.set(HowlOutputFormat.HOWL_KEY_OUTPUT_INFO, p.getProperty(HowlOutputFormat.HOWL_KEY_OUTPUT_INFO));
      if(p.getProperty(HowlOutputFormat.HOWL_KEY_HIVE_CONF) != null){
        config.set(HowlOutputFormat.HOWL_KEY_HIVE_CONF, p.getProperty(HowlOutputFormat.HOWL_KEY_HIVE_CONF));
      }
      if(p.getProperty(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE) != null){
        config.set(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE,
            p.getProperty(HowlOutputFormat.HOWL_KEY_TOKEN_SIGNATURE));
      }
    }
  }

  @Override
  public void storeSchema(ResourceSchema schema, String arg1, Job job) throws IOException {
    if( job.getConfiguration().get("mapred.job.tracker", "").equalsIgnoreCase("local") ) {
      //In local mode, mapreduce will not call HowlOutputCommitter.cleanupJob.
      //Calling it from here so that the partition publish happens.
      //This call needs to be removed after MAPREDUCE-1447 is fixed.
      new HowlOutputCommitter(null).cleanupJob(job);
    }
  }

  @Override
  public void storeStatistics(ResourceStatistics stats, String arg1, Job job) throws IOException {
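    // No-op.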
  }
}