// Package brickhouse.udf.sketch
//
// Source Code of brickhouse.udf.sketch.CombineSketchUDF

package brickhouse.udf.sketch;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/

import java.util.List;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

import brickhouse.analytics.uniques.SketchSet;

/**
*  UDF to combine two sketch sets, to estimate size of set union.
*  Sketch sets can be either the set of original strings or the 
*    MD5 hashes. If array<string> is passed in, it is assumed to be
*    the original sketch_set values; if array<bigint> is used, then
*     it is assumed to be the KMin hash values created with sketch_values
*
*/
@Description(name="combine_sketch",
    value = "_FUNC_(x) - Combine two sketch sets. "
)
public class CombineSketchUDF extends GenericUDF {
  private ListObjectInspector listInspectors[];
  private PrimitiveCategory elemCategory;
  private int sketchSetSize = SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE;
 
  @Override
  public Object evaluate(DeferredObject[] arg0) throws HiveException {
    SketchSet ss = new SketchSet(sketchSetSize);
    for( int i=0; i< arg0.length; ++i) {
      Object listObj = arg0[i].get();
      int listLen = listInspectors[i].getListLength(listObj);
      for(int j=0; j< listLen; ++j ) {
           Object uninspObj = listInspectors[i].getListElement(listObj, j);
           switch( elemCategory) {
           case STRING:
             StringObjectInspector strInspector = (StringObjectInspector) listInspectors[i].getListElementObjectInspector();
             String item = strInspector.getPrimitiveJavaObject(uninspObj);
             ss.addItem(item);
             break;
           case LONG:
             LongObjectInspector bigintInspector = (LongObjectInspector) listInspectors[i].getListElementObjectInspector();
             long itemHash = bigintInspector.get(uninspObj);
             ss.addHash( itemHash);
             break;
           }
      }
    }
      switch( elemCategory) {
        case STRING:
          return ss.getMinHashItems();
        case LONG:
          return ss.getMinHashes();
        default:
          /// will never happen
          throw new HiveException("Unexpected Element Category " + elemCategory);
      }
  }

  @Override
  public String getDisplayString(String[] arg0) {
    return "combine_sketch";
  }

  @Override
  public ObjectInspector initialize(ObjectInspector[] arg0)
      throws UDFArgumentException {
    if( arg0.length < 2 ) {
      throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
    }
    if(arg0[0].getCategory() != Category.LIST) {
      throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
    }
    ObjectInspector lastInspector = arg0[arg0.length -1 ];
    int listLen = arg0.length;
    if( lastInspector.getCategory() == Category.PRIMITIVE
            && ((PrimitiveObjectInspector)lastInspector).getPrimitiveCategory() == PrimitiveCategory.INT) {
       if(lastInspector instanceof ConstantObjectInspector)  {
          
       } else {
          throw new UDFArgumentException(" Sketch set size must an integer");
       }
          
    }
    this.listInspectors = new ListObjectInspector[ arg0.length];
    this.listInspectors[0] = (ListObjectInspector) arg0[0];
    if( this.listInspectors[0].getListElementObjectInspector().getCategory() != Category.PRIMITIVE) {
      throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
    }
    this.elemCategory = ((PrimitiveObjectInspector)((listInspectors[0].getListElementObjectInspector()))).getPrimitiveCategory();
    if(this.elemCategory != PrimitiveCategory.STRING && this.elemCategory != PrimitiveCategory.LONG) {
      throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
    }
    for(int i=1; i< arg0.length; ++i) {
      if( arg0[i].getCategory() != Category.LIST) {
         throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
      }
      this.listInspectors[i] = (ListObjectInspector) arg0[i];
      if(((PrimitiveObjectInspector)((listInspectors[0].getListElementObjectInspector()))).getPrimitiveCategory() != elemCategory ) {
         throw new UDFArgumentException("combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>");
      }
    }
    return ObjectInspectorFactory.getStandardListObjectInspector(
        PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(elemCategory));
  }

}
// TOP
//
// Related Classes of brickhouse.udf.sketch.CombineSketchUDF
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.