package sizzle.aggregators;
import java.io.IOException;
import org.apache.hadoop.util.bloom.DynamicBloomFilter;
import org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;
import sizzle.io.EmitKey;
/**
* A Sizzle aggregator to estimate the set of the unique values in a dataset.
* Roughly equivalent to a distinct(*).
*
* @author anthonyu
*
*/
@AggregatorSpec(name = "distinct", formalParameters = { "int" })
public class DistinctAggregator extends Aggregator {
// from o.a.h.io.BloomMapFile#initBloomFilter
private static int HASH_COUNT = 5;
private final int vectorSize;
private final float errorRate;
private Filter filter;
/**
* Construct a DistinctAggregator.
*
* @param arg
* The size of the internal table used to perform the
* calculation.
*/
public DistinctAggregator(long arg) {
super(arg);
// this is all cribbed from o.a.h.io.BloomMapFile#initBloomFilter
// vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
// single key, where <code> is the number of hash functions,
// <code>n</code> is the number of keys and <code>c</code> is the
// desired
// max. error rate.
// Our desired error rate is by default 0.005, i.e. 0.5%
this.errorRate = 0.005f;
this.vectorSize = (int) Math.ceil(-DistinctAggregator.HASH_COUNT * arg / Math.log(1.0 - Math.pow(this.errorRate, 1.0 / DistinctAggregator.HASH_COUNT)));
}
/** {@inheritDoc} */
@Override
public void start(EmitKey key) {
super.start(key);
// TODO: add a clear function to the bloom filter in Hadoop and use it
// here instead of instantiating a new one for every key
this.filter = new DynamicBloomFilter(this.vectorSize, DistinctAggregator.HASH_COUNT, Hash.MURMUR_HASH, (int) this.getArg());
}
/** {@inheritDoc} */
@Override
public void aggregate(String data, String metadata) throws IOException, InterruptedException {
// instantiate a bloom filter input key initialized by the data
Key key = new Key(data.getBytes());
// if the key is already in the filter, forget it
if (this.filter.membershipTest(key))
return;
// add the key to the bloom filter
this.filter.add(key);
// and collect it
this.collect(data);
}
/** {@inheritDoc} */
@Override
public boolean isAssociative() {
return true;
}
/** {@inheritDoc} */
@Override
public boolean isCommutative() {
return true;
}
protected Filter getFilter() {
return this.filter;
}
}