Source Code of brickhouse.udf.sketch.SetSimilarityUDF

package brickhouse.udf.sketch;


import java.util.List;
import java.util.Map;


import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;


import brickhouse.analytics.uniques.SketchSet;


/**
 * Compute the Jaccard similarity of two sketch sets.
 * 
 * Jaccard Similarity is defined as the size of the intersection of two sets divided by the 
 *   size of the union of the sets. Since sketches are only approximate measures, this
 *   calculation only makes sense when the sets are roughly the same size.
 *
 */
@Description(name="set_similarity",
value = "_FUNC_(a,b) - Compute the Jaccard set similarity of two sketch sets. "
)
public class SetSimilarityUDF extends UDF {


  public Double evaluate( List<String> a, List<String> b) {
    if( a == null || b == null ) 
      return null;
    if( a.size() ==0 || b.size() == 0 ) {
      return 0.0;
    }
    /// For now, assume min sketch size is 5000...
    /// otherwise it is better to use array_intersect
    /// XXX TODO convert to GenericUDF, so that it can be passed in 
    ///  as an argument
    int sketchSize = Math.max( a.size() , b.size() );
    if( sketchSize < SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE)
        sketchSize = SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE;
    
    SketchSet sketchA = new SketchSet(sketchSize);
    SketchSet sketchB = new SketchSet(sketchSize);
    SketchSet sketchAUB = new SketchSet(sketchSize);
    
    
    for(String aStr : a) {
      sketchA.addItem( aStr);
      sketchAUB.addItem( aStr);
    }
    for(String bStr : b) {
      sketchB.addItem( bStr);
      sketchAUB.addItem( bStr);
    }
    
    double aEst = sketchA.estimateReach();
    double bEst = sketchB.estimateReach();
    double aubEst = sketchAUB.estimateReach();
    
    /// Intersection is 
    double ainterb =  aEst + bEst - aubEst;
    double sim = ainterb/aubEst;
    
    return sim;
  }
}
Source Code of brickhouse.udf.sketch.SetSimilarityUDF

Related Classes of brickhouse.udf.sketch.SetSimilarityUDF