Package sizzle.aggregators

Source Code of sizzle.aggregators.DistinctAggregator

package sizzle.aggregators;

import java.io.IOException;


import org.apache.hadoop.util.bloom.DynamicBloomFilter;
import org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

import sizzle.io.EmitKey;

/**
* A Sizzle aggregator to estimate the set of the unique values in a dataset.
* Roughly equivalent to a distinct(*).
*
* @author anthonyu
*
*/
@AggregatorSpec(name = "distinct", formalParameters = { "int" })
public class DistinctAggregator extends Aggregator {
  // from o.a.h.io.BloomMapFile#initBloomFilter
  private static int HASH_COUNT = 5;

  private final int vectorSize;
  private final float errorRate;

  private Filter filter;

  /**
   * Construct a DistinctAggregator.
   *
   * @param arg
   *            The size of the internal table used to perform the
   *            calculation.
   */
  public DistinctAggregator(long arg) {
    super(arg);

    // this is all cribbed from o.a.h.io.BloomMapFile#initBloomFilter

    // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
    // single key, where <code> is the number of hash functions,
    // <code>n</code> is the number of keys and <code>c</code> is the
    // desired
    // max. error rate.
    // Our desired error rate is by default 0.005, i.e. 0.5%
    this.errorRate = 0.005f;
    this.vectorSize = (int) Math.ceil(-DistinctAggregator.HASH_COUNT * arg / Math.log(1.0 - Math.pow(this.errorRate, 1.0 / DistinctAggregator.HASH_COUNT)));
  }

  /** {@inheritDoc} */
  @Override
  public void start(EmitKey key) {
    super.start(key);

    // TODO: add a clear function to the bloom filter in Hadoop and use it
    // here instead of instantiating a new one for every key
    this.filter = new DynamicBloomFilter(this.vectorSize, DistinctAggregator.HASH_COUNT, Hash.MURMUR_HASH, (int) this.getArg());
  }

  /** {@inheritDoc} */
  @Override
  public void aggregate(String data, String metadata) throws IOException, InterruptedException {
    // instantiate a bloom filter input key initialized by the data
    Key key = new Key(data.getBytes());

    // if the key is already in the filter, forget it
    if (this.filter.membershipTest(key))
      return;

    // add the key to the bloom filter
    this.filter.add(key);

    // and collect it
    this.collect(data);
  }

  /** {@inheritDoc} */
  @Override
  public boolean isAssociative() {
    return true;
  }

  /** {@inheritDoc} */
  @Override
  public boolean isCommutative() {
    return true;
  }

  protected Filter getFilter() {
    return this.filter;
  }
}
TOP

Related Classes of sizzle.aggregators.DistinctAggregator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.