package brickhouse.udf.sketch;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.Days;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import brickhouse.analytics.uniques.SketchSet;
/**
 * Hive UDAF ({@code multiday_sketch}) that accumulates event counts and sketches of
 * unique values over several trailing day periods in a single pass
 * (i.e. produce 1, 7 and 30 day counts for various events).
 *
 * <p>Arguments, in the order the evaluator reads them:
 * <ol>
 *   <li>the date associated with the events, as a {@code yyyyMMdd} string,</li>
 *   <li>a bigint with the event count for that date,</li>
 *   <li>an array of strings holding the uniques for that count,</li>
 *   <li>the "as of" date the periods are measured back from, as a {@code yyyyMMdd} string,</li>
 *   <li>an array of ints with the period lengths in days (e.g. {@code [1,7,30]}).</li>
 * </ol>
 * The as-of date and the period array are read once, from the first row the evaluator
 * sees, and are then reused for every following row and group.
 *
 * <p>Output is an array of structs {@code (num_days, cnt, sketch_sets)}: the period length,
 * the sum of event counts that fell inside that period, and the min-hash items of the
 * sketch built from the uniques of those events.
 *
 * <p>XXX Snarfed from multiday counter ...
 * <br>TODO write one UDF which can be configured to sketch or count
 * <br>TODO Generalize to represent other periods besides Days
 * <br>XXX Probably needs Const object inspectors
 */
@Description(name="multiday_sketch",
value = "_FUNC_(x) - Returns a count of events over several different periods,"
)
public class MultiDaySketcherUDAF extends AbstractGenericUDAFResolver {
    private static final Logger LOG = Logger.getLogger(MultiDaySketcherUDAF.class);
    private static final String SKETCH_FLAG_PROP = "klout.warehouse.multiday_sketch";
    /** Usage message shared by the argument checks. */
    private static final String USAGE = "multiday_sketch takes date, count, array, date, array ";

    public MultiDaySketcherUDAF() {
    }

    /**
     * Checks the argument list and hands back a fresh evaluator.
     *
     * <p>Parameters are event date, event count, event uniques, asof date, period array.
     *
     * @param parameters type information of the call's arguments
     * @return a new {@link MultiDayAggUDAFEvaluator}
     * @throws SemanticException if the argument count is not 5 or 6, or the first argument
     *         is not a primitive type
     */
    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
            throws SemanticException {
        for (int i = 0; i < parameters.length; ++i) {
            LOG.info("Type " + i + " == " + parameters[i].getTypeName()
                    + " category " + parameters[i].getCategory().name());
        }
        if (parameters.length != 5 && parameters.length != 6) {
            throw new UDFArgumentTypeException(parameters.length - 1, USAGE);
        }
        if (parameters[0].getCategory() != Category.PRIMITIVE) {
            // Point the error at the argument that is actually wrong.
            throw new UDFArgumentTypeException(0, USAGE);
        }
        return new MultiDayAggUDAFEvaluator();
    }

    /**
     * Evaluator holding the per-query state (periods, as-of date, inspectors) and the
     * aggregation logic. Partial results travel between stages as a list of string lists:
     * for every period {@code [num_days, count, minHashItem...]}.
     */
    public static class MultiDayAggUDAFEvaluator extends GenericUDAFEvaluator {
        private static final DateTimeFormatter YYYYMMDD = DateTimeFormat.forPattern("yyyyMMdd");

        /** Period lengths in days; null until the first row or partial has been seen. */
        private int[] daysArr;
        /** Date the periods are measured back from; null until the first row has been seen. */
        private DateTime asofDate;

        // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (list
        // of objs)
        private StandardListObjectInspector internalMergeOI;

        // For PARTIAL1 and COMPLETE: ObjectInspectors for original data
        private StringObjectInspector asofInspector;
        private StringObjectInspector dtInspector;
        private LongObjectInspector longInspector;
        private ListObjectInspector uniqInspector;
        private ListObjectInspector daysArrInspector;

        /** Per-group state: one running count and one sketch per configured period. */
        static class MultiDaySketchBuffer implements AggregationBuffer {
            long counts[];
            SketchSet[] sketches;
        }

        /**
         * Stores the input inspectors for the given mode and returns the inspector that
         * describes what this stage emits.
         *
         * <p>The input side depends on whether the stage consumes raw rows
         * (PARTIAL1, COMPLETE) or partial aggregations (PARTIAL2, FINAL). The output side
         * depends on whether the stage ends with {@code terminatePartial}
         * (PARTIAL1, PARTIAL2) or {@code terminate} (FINAL, COMPLETE).
         *
         * @throws HiveException if the raw-row arguments do not have the expected types
         */
        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters)
                throws HiveException {
            super.init(m, parameters);
            LOG.info(" MODE = " + m.name() + " Num parameters = " + parameters.length);
            for (int i = 0; i < parameters.length; ++i) {
                LOG.info(" Parameter [ " + i + " ] == " + parameters[i]);
            }

            // ---- input side ----
            if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
                if (parameters.length < 5) {
                    throw new UDFArgumentTypeException(parameters.length - 1, USAGE);
                }
                dtInspector = requireType(parameters, 0, StringObjectInspector.class,
                        "a yyyyMMdd date string");
                longInspector = requireType(parameters, 1, LongObjectInspector.class,
                        "a bigint count");
                uniqInspector = requireType(parameters, 2, ListObjectInspector.class,
                        "an array of unique strings");
                asofInspector = requireType(parameters, 3, StringObjectInspector.class,
                        "a yyyyMMdd date string");
                daysArrInspector = requireType(parameters, 4, ListObjectInspector.class,
                        "an array of int day periods");
            } else {
                this.internalMergeOI = (StandardListObjectInspector) parameters[0];
            }

            // ---- output side ----
            if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) {
                //// return a list of list of strings ...
                //// First string is the number of days, second the count, rest are the uniques ...
                ListObjectInspector strListInspector = ObjectInspectorFactory
                        .getStandardListObjectInspector(
                                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
                return ObjectInspectorFactory.getStandardListObjectInspector(strListInspector);
            }

            List<String> fieldNames = new ArrayList<String>();
            List<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>();
            fieldNames.add("num_days");
            fieldInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector);
            fieldNames.add("cnt");
            fieldInspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
            fieldNames.add("sketch_sets");
            fieldInspectors.add(ObjectInspectorFactory.getStandardListObjectInspector(
                    PrimitiveObjectInspectorFactory.javaStringObjectInspector));
            ObjectInspector structType = ObjectInspectorFactory
                    .getStandardStructObjectInspector(fieldNames, fieldInspectors);
            return ObjectInspectorFactory.getStandardListObjectInspector(structType);
        }

        /** Returns {@code params[idx]} cast to {@code type}, or fails with a readable argument error. */
        private static <T> T requireType(ObjectInspector[] params, int idx, Class<T> type,
                String expected) throws UDFArgumentTypeException {
            ObjectInspector oi = params[idx];
            if (!type.isInstance(oi)) {
                throw new UDFArgumentTypeException(idx, "multiday_sketch expects argument "
                        + (idx + 1) + " to be " + expected + " but got "
                        + (oi == null ? "null" : oi.getTypeName()));
            }
            return type.cast(oi);
        }

        /**
         * Adds one row to every period the event date falls into, i.e. every period whose
         * length is greater than the number of days between the event date and the as-of date.
         */
        private void addMultiDay(MultiDaySketchBuffer mdCounter, DateTime dt, long cnt, List<?> uniqs) {
            // Same for every period, so compute it once.
            // NOTE(review): an event dated after the as-of date gives a negative value and is
            // therefore counted in every period - confirm that this is intended.
            int daysBetween = Days.daysBetween(dt, asofDate).getDays();
            List<String> uniqStrs = null; // converted lazily, only if some period matches
            for (int i = 0; i < daysArr.length; ++i) {
                if (daysBetween < daysArr[i]) {
                    mdCounter.counts[i] += cnt;
                    if (uniqStrs == null) {
                        uniqStrs = toStrings(uniqs);
                    }
                    for (String uniqStr : uniqStrs) {
                        mdCounter.sketches[i].addItem(uniqStr);
                    }
                }
            }
        }

        /** Converts the elements of the uniques array to Java strings; a null array yields an empty list. */
        private List<String> toStrings(List<?> uniqs) {
            List<String> out = new ArrayList<String>();
            if (uniqs == null || uniqs.isEmpty()) {
                return out;
            }
            StringObjectInspector elemInspector =
                    (StringObjectInspector) uniqInspector.getListElementObjectInspector();
            for (Object unObj : uniqs) {
                out.add(elemInspector.getPrimitiveJavaObject(unObj));
            }
            return out;
        }

        /** Reads the period array argument into {@link #daysArr}. */
        private void setDaysArr(Object obj) throws HiveException {
            List<?> inspected = this.daysArrInspector.getList(obj);
            if (inspected == null) {
                throw new HiveException("multiday_sketch: the array of day periods must not be null");
            }
            IntObjectInspector elemInspector =
                    (IntObjectInspector) daysArrInspector.getListElementObjectInspector();
            int[] days = new int[inspected.size()];
            int idx = 0;
            for (Object elem : inspected) {
                if (elem == null) {
                    throw new HiveException(
                            "multiday_sketch: the array of day periods must not contain nulls");
                }
                days[idx++] = elemInspector.get(elem);
            }
            // Publish only once the whole array has been read successfully.
            daysArr = days;
        }

        /** Reads the as-of date argument into {@link #asofDate}. */
        private void setAsofDate(Object obj) throws HiveException {
            String str = asofInspector.getPrimitiveJavaObject(obj);
            if (str == null) {
                throw new HiveException("multiday_sketch: the as-of date must not be null");
            }
            asofDate = getDateTime(str);
        }

        /** Parses a {@code yyyyMMdd} string. */
        private DateTime getDateTime(String str) {
            return YYYYMMDD.parseDateTime(str);
        }

        private long getLong(Object obj) {
            return longInspector.get(obj);
        }

        private List<?> getList(Object obj) {
            return this.uniqInspector.getList(obj);
        }

        /**
         * Creates a buffer. Its arrays stay null until the periods are known; they are then
         * allocated lazily by {@link #iterate}, {@link #merge} or the terminate methods.
         */
        @Override
        public AggregationBuffer getNewAggregationBuffer() throws HiveException {
            AggregationBuffer buff = new MultiDaySketchBuffer();
            reset(buff);
            return buff;
        }

        /**
         * Consumes one raw row. The period array and as-of date are taken from the first row
         * seen. Rows whose event date or count is null are skipped.
         */
        @Override
        public void iterate(AggregationBuffer agg, Object[] parameters)
                throws HiveException {
            if (daysArr == null) {
                setDaysArr(parameters[4]);
            }
            if (asofDate == null) {
                setAsofDate(parameters[3]);
            }
            MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg;
            if (myagg.counts == null) {
                // Buffer was created before the periods were known.
                reset(myagg);
            }

            String dtStr = dtInspector.getPrimitiveJavaObject(parameters[0]);
            if (dtStr == null || parameters[1] == null) {
                return;
            }
            DateTime dt = getDateTime(dtStr);
            long cnt = getLong(parameters[1]);
            List<?> uniqList = getList(parameters[2]);
            addMultiDay(myagg, dt, cnt, uniqList);
        }

        /**
         * Folds a partial result (as produced by {@link #terminatePartial}) into the buffer.
         * A null partial, which an upstream stage emits when it saw no rows, is ignored.
         */
        @Override
        public void merge(AggregationBuffer agg, Object partial)
                throws HiveException {
            if (partial == null) {
                return;
            }
            List<?> partialResultList = internalMergeOI.getList(partial);
            if (partialResultList == null) {
                return;
            }
            if (daysArr == null) {
                daysArr = new int[partialResultList.size()];
            }
            MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg;
            if (myagg.counts == null) {
                reset(myagg);
            }
            ListObjectInspector subListInspector =
                    (ListObjectInspector) internalMergeOI.getListElementObjectInspector();
            StringObjectInspector strInspector =
                    (StringObjectInspector) subListInspector.getListElementObjectInspector();
            int idx = 0;
            for (Object strListObj : partialResultList) {
                List<?> strList = subListInspector.getList(strListObj);
                // Both values were written with toString(), so they are plain decimal.
                String numDaysStr = strInspector.getPrimitiveJavaObject(strList.get(0));
                daysArr[idx] = Integer.parseInt(numDaysStr);
                String cntStr = strInspector.getPrimitiveJavaObject(strList.get(1));
                myagg.counts[idx] += Long.parseLong(cntStr);
                // Everything after the first two entries is a sketch item.
                for (int j = 2; j < strList.size(); ++j) {
                    String uniqStr = strInspector.getPrimitiveJavaObject(strList.get(j));
                    myagg.sketches[idx].addItem(uniqStr);
                }
                idx++;
            }
        }

        /** Re-allocates zeroed counts and empty sketches; does nothing while the periods are unknown. */
        @Override
        public void reset(AggregationBuffer buff) throws HiveException {
            MultiDaySketchBuffer countBuff = (MultiDaySketchBuffer) buff;
            if (daysArr != null) {
                countBuff.counts = new long[daysArr.length];
                countBuff.sketches = new SketchSet[daysArr.length];
                for (int i = 0; i < countBuff.sketches.length; ++i) {
                    countBuff.sketches[i] = new SketchSet();
                }
            }
        }

        /**
         * Produces the final result: one {@code [num_days, cnt, sketch items]} struct per period.
         *
         * @return the list of structs, or null if no row or partial was ever seen
         */
        @Override
        public Object terminate(AggregationBuffer agg) throws HiveException {
            if (daysArr == null) {
                return null;
            }
            MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg;
            if (myagg.counts == null) {
                reset(myagg);
            }
            List<List<Object>> ret = new ArrayList<List<Object>>(daysArr.length);
            for (int i = 0; i < daysArr.length; ++i) {
                List<Object> structArr = new ArrayList<Object>(3);
                structArr.add(Integer.valueOf(daysArr[i])); /// num_days
                structArr.add(Long.valueOf(myagg.counts[i]));
                List<String> sketchList = myagg.sketches[i].getMinHashItems();
                structArr.add(sketchList);
                ret.add(structArr);
            }
            return ret;
        }

        /**
         * Produces the partial result: per period a string list
         * {@code [num_days, count, minHashItem...]}.
         *
         * @return the list of string lists, or null if no row or partial was ever seen
         */
        @Override
        public Object terminatePartial(AggregationBuffer agg) throws HiveException {
            if (daysArr == null) {
                return null;
            }
            MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg;
            if (myagg.counts == null) {
                reset(myagg);
            }
            List<List<String>> ret = new ArrayList<List<String>>(daysArr.length);
            for (int i = 0; i < daysArr.length; ++i) {
                List<String> strList = new ArrayList<String>();
                strList.add(Integer.toString(daysArr[i]));
                strList.add(Long.toString(myagg.counts[i]));
                List<String> itemList = myagg.sketches[i].getMinHashItems();
                for (String minHashItem : itemList) {
                    strList.add(minHashItem); //// XXX TODO for sketch sets, pass the hash as well ...
                }
                ret.add(strList);
            }
            return ret;
        }
    }
}