Package com.somethingsimilar.opposite_of_a_bloom_filter

Source Code of com.somethingsimilar.opposite_of_a_bloom_filter.ByteArrayFilter

// Copyright 2012 Jeff Hodges. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package com.somethingsimilar.opposite_of_a_bloom_filter;

import java.math.RoundingMode;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReferenceArray;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.math.IntMath;

/**
* ByteArrayFilter is used to filter out duplicate byte arrays from a given dataset or stream. It is
* guaranteed to never return a false positive (that is, it will never say that an item has already
* been seen by the filter when it has not) but may return a false negative.
*
* ByteArrayFilter is thread-safe.
*/
public class ByteArrayFilter {
  private static final HashFunction HASH_FUNC = Hashing.murmur3_32();
  private final int sizeMask;
  private final AtomicReferenceArray<byte[]> array;
  private static final int MAX_SIZE = 1 << 30;

  /**
   * Constructs a ByteArrayFilter with an underlying array of the given size, rounded up to the next
   * power of two.
   *
   * This rounding occurs because the hashing is much faster on an array the size of a power of two.
   * If you really want a different sized array, used the AtomicReferenceArray constructor.
   *
   * @param size The size of the underlying array.
   */
  public ByteArrayFilter(int size) {
    if (size <= 0) {
      throw new IllegalArgumentException("array size must be greater than zero, was " + size);
    }
    if (size > MAX_SIZE) {
      throw new IllegalArgumentException(
          "array size may not be larger than 2**31-1, but will be rounded to larger. was " + size);
    }
    // round to the next largest power of two
    int poweredSize = IntMath.pow(2, IntMath.log2(size, RoundingMode.CEILING));
    this.sizeMask = poweredSize - 1;
    this.array = new AtomicReferenceArray<byte[]>(poweredSize);
  }

  /**
   * Returns whether the given byte array has been previously seen by this array. That is, if a byte
   * array with the same bytes as id has been passed to to this method before.
   *
   * This method may return false when it has seen an id before. This occurs if the id passed in
   * hashes to the same index in the underlying array as another id previously checked. On the
   * flip side, this method will never return true incorrectly.
   *
   * @param id The byte array that may have been previously seen.
   * @return Whether the byte array is contained in the ByteArrayFilter.
   */
  public boolean containsAndAdd(byte[] id) {
    HashCode code = HASH_FUNC.hashBytes(id);
    int index = Math.abs(code.asInt()) & sizeMask;
    byte[] oldId = array.getAndSet(index, id);
    return Arrays.equals(id, oldId);
  }

  /**
   * Returns the size of the underlying array. Welp.
   *
   * @return The size of the underlying array.
   */
  public int getSize() {
    return array.length();
  }
}
TOP

Related Classes of com.somethingsimilar.opposite_of_a_bloom_filter.ByteArrayFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.