Package sizzle.aggregators

Source Code of sizzle.aggregators.UniqueAggregator

package sizzle.aggregators;

import java.io.IOException;


import org.apache.hadoop.util.bloom.Filter;
import org.apache.hadoop.util.bloom.Key;

import sizzle.io.EmitKey;

/**
* A Sizzle aggregator to estimate the size of the set of unique values in a
* dataset. Roughly equivalent to a count(distinct(*)).
*
* @author anthonyu
*
*/
@AggregatorSpec(name = "unique", formalParameters = { "int" })
public class UniqueAggregator extends DistinctAggregator {
  private long total;

  /**
   * Construct a UniqueAggregator.
   *
   * @param arg
   *            The size of the internal table used to perform the
   *            calculation.
   */
  public UniqueAggregator(long arg) {
    super(arg);
  }

  /** {@inheritDoc} */
  @Override
  public void start(EmitKey key) {
    super.start(key);

    // clear out the internal total
    this.total = 0;
  }

  /** {@inheritDoc} */
  @Override
  public void aggregate(String data, String metadata) throws IOException, InterruptedException {
    // instantiate a bloom filter input key initialized by the data
    Key key = new Key(data.getBytes());

    // if the key is already in the filter, forget about it
    Filter filter = this.getFilter();
    if (filter.membershipTest(key))
      return;

    // add the key to the bloom filter
    filter.add(key);

    if (this.isCombining())
      this.collect(data);
    else
      this.total++;
  }

  /** {@inheritDoc} */
  @Override
  public void finish() throws IOException, InterruptedException {
    // if we are reducing, collect the total
    if (!this.isCombining())
      this.collect(this.total);
  }
}
TOP

Related Classes of sizzle.aggregators.UniqueAggregator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.