package brickhouse.udf.sketch;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.log4j.Logger;
import brickhouse.analytics.uniques.SketchSet;
/**
 * Hive UDF {@code estimated_reach}: interprets an array as a sketch set and
 * returns an estimated reach as a {@code bigint}.
 *
 * <p>Arguments:
 * <ol>
 *   <li>an array of strings, or an array of {@code bigint} hashes;</li>
 *   <li>optionally, an {@code int} sketch size; when omitted (or NULL),
 *       {@code SketchSet.DEFAULT_MAX_ITEMS} is used.</li>
 * </ol>
 *
 * <p>When the array holds fewer items than the sketch size, the array length
 * itself is returned. Otherwise the element at index {@code size - 1} is passed
 * to {@code SketchSet.EstimatedReach}, and the larger of that estimate and the
 * array length is returned. A NULL array yields NULL.
 */
@Description(name="estimated_reach",
        value = "_FUNC_(x) - Estimate reach from a sketch set of Strings. "
)
public class EstimatedReachUDF extends GenericUDF {
    private static final Logger LOG = Logger.getLogger(EstimatedReachUDF.class);

    /** Single usage message raised for every argument-validation failure in {@link #initialize}. */
    private static final String USAGE =
            "estimated_reach takes an array of strings or an array of hashes, and an optional sketch size";

    private ListObjectInspector listInspector;
    private PrimitiveObjectInspector elemInspector;
    private PrimitiveCategory elemCategory;
    private IntObjectInspector lengthInspector;

    /**
     * Computes the estimated reach for one row.
     *
     * @param arg0 the deferred arguments: the sketch array, then optionally the sketch size
     * @return the estimate as a {@code Long}, or {@code null} when the array is NULL
     * @throws HiveException if the sketch size is not positive, or the element
     *         category is neither STRING nor LONG
     */
    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        Object listObj = arg0[0].get();
        if (listObj == null) {
            // NULL in, NULL out; previously the inspector's "no list" length leaked out as the reach.
            return null;
        }

        int maxItems = resolveMaxItems(arg0);
        int listLen = listInspector.getListLength(listObj);
        if (listLen < 0) {
            // Inspector reports no usable list for this object.
            return null;
        }
        if (listLen < maxItems) {
            // Sketch is not full yet, so the length is reported as-is.
            return Long.valueOf(listLen);
        }
        if (listLen > maxItems) {
            LOG.warn("estimated_reach: List length " + listLen + " is greater than sketch set Max items " + maxItems);
        }

        Object boundary = listInspector.getListElement(listObj, maxItems - 1);
        if (boundary == null) {
            // Nothing to estimate from; fall back to the list length, which is
            // the lower bound the estimate is clamped to anyway.
            return Long.valueOf(listLen);
        }

        double reach;
        switch (this.elemCategory) {
            case STRING:
                String lastItem = ((StringObjectInspector) elemInspector).getPrimitiveJavaObject(boundary);
                reach = SketchSet.EstimatedReach(lastItem, maxItems);
                break;
            case LONG:
                long lastHash = ((LongObjectInspector) elemInspector).get(boundary);
                reach = SketchSet.EstimatedReach(lastHash, maxItems);
                break;
            default:
                // initialize() rejects every other category, so this should not happen.
                throw new HiveException("Unexpected category type");
        }
        // Never report fewer items than are actually present in the list.
        return Long.valueOf(reach > listLen ? (long) reach : (long) listLen);
    }

    /**
     * Reads the optional sketch-size argument, defaulting when it is absent or NULL.
     *
     * @throws HiveException if a size is supplied but is zero or negative
     */
    private int resolveMaxItems(DeferredObject[] args) throws HiveException {
        if (args.length < 2) {
            return SketchSet.DEFAULT_MAX_ITEMS;
        }
        Object rawSize = args[1].get();
        if (rawSize == null) {
            return SketchSet.DEFAULT_MAX_ITEMS;
        }
        int size = lengthInspector.get(rawSize);
        if (size <= 0) {
            throw new HiveException("estimated_reach: sketch size must be positive, but was " + size);
        }
        return size;
    }

    /**
     * Renders the call as {@code estimated_reach( a , b )}.
     *
     * @param arg0 display strings of the arguments; may be empty
     */
    @Override
    public String getDisplayString(String[] arg0) {
        StringBuilder sb = new StringBuilder("estimated_reach( ");
        for (int i = 0; i < arg0.length; ++i) {
            if (i > 0) {
                sb.append(" , ");
            }
            sb.append(arg0[i]);
        }
        sb.append(" )");
        return sb.toString();
    }

    /**
     * Validates the argument types and caches the inspectors used by {@link #evaluate}.
     *
     * @param arg0 inspectors for the sketch array and, optionally, the int sketch size
     * @return the Java {@code Long} object inspector describing the result
     * @throws UDFArgumentException if the argument count is not 1 or 2, the first
     *         argument is not an array of strings or longs, or the second is not an int
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0)
            throws UDFArgumentException {
        if (arg0.length != 1 && arg0.length != 2) {
            throw new UDFArgumentException(USAGE);
        }
        if (arg0[0].getCategory() != Category.LIST) {
            throw new UDFArgumentException(USAGE);
        }
        this.listInspector = (ListObjectInspector) arg0[0];

        ObjectInspector elementOI = listInspector.getListElementObjectInspector();
        if (elementOI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentException(USAGE);
        }
        this.elemInspector = (PrimitiveObjectInspector) elementOI;
        this.elemCategory = this.elemInspector.getPrimitiveCategory();
        // Log the primitive category; the plain category is always PRIMITIVE at this point.
        LOG.info(" Element category is " + this.elemCategory);
        if (this.elemCategory != PrimitiveCategory.STRING
                && this.elemCategory != PrimitiveCategory.LONG) {
            throw new UDFArgumentException(USAGE);
        }

        if (arg0.length > 1) {
            if (!(arg0[1] instanceof IntObjectInspector)) {
                throw new UDFArgumentException(USAGE);
            }
            this.lengthInspector = (IntObjectInspector) arg0[1];
        }
        return PrimitiveObjectInspectorFactory.javaLongObjectInspector;
    }
}