Package com.cloudera.flume.handlers.debug

Source Code of com.cloudera.flume.handlers.debug.BloomGeneratorDeco

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cloudera.flume.handlers.debug;

import java.io.IOException;
import java.util.Arrays;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkDecoBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.core.EventSinkDecorator;
import com.cloudera.util.bloom.BloomSet;
import com.google.common.base.Preconditions;

/**
* This decorator takes hashes of messages and then inserts them into a bloom
* filter. On deco close, the bit map representation of the bloom filter is
* transmitted into the stream.
*
* A corresponding BloomChecker can track received message and can approximately
* verify that all messages injected were included. If the generator sends
* messages that aren't received by the checker, probabilistically this should
* detect the omission.
*
* The probability of a dropped message not being detected is equivalent to the
* probability of that dropped message being a false positive if queried. (ie,
* the addition of the message to the set added no new information). Wikipedia
* says to use at a minimum of 9 bits per inserted item, and that extra hashes
* roughly decrease the probability by an order of magnitude.
*
*/
public class BloomGeneratorDeco extends EventSinkDecorator<EventSink> {
  public static final Logger LOG = LoggerFactory
      .getLogger(BloomGeneratorDeco.class);
  protected BloomSet bloom;
  final int size; // size of bloom bit array in bits
  final int hashes; // number of hashes per insertion/membership test

  public final static String A_BLOOMSETDATA = "bloomSetData";

  /**
   * This generator must have the same size and # hash as the downstream
   * BloomCheckDeco.
   */
  public BloomGeneratorDeco(EventSink s, int size, int hashes) {
    super(s);
    this.size = size;
    this.hashes = hashes;
  }

  /**
   * The default sink here is null and must be set by setSink before usage.
   *
   * This generator must have the same size and # hash as the downstream
   * BloomCheckDeco.
   */

  public BloomGeneratorDeco(int size, int hashes) {
    this(null, size, hashes);
  }

  /** {@inheritDoc} */
  @Override
  public void open() throws IOException, InterruptedException {
    bloom = new BloomSet(size, hashes);
    super.open();
  }

  /** {@inheritDoc} */
  @Override
  public void append(Event e) throws IOException, InterruptedException {
    // take a hash of the bytes contents and add them to bloom filter.
    includeEvent(bloom, e);

    // then just send the data
    super.append(e);
  }

  /** {@inheritDoc} */
  @Override
  public void close() throws IOException, InterruptedException {
    EventImpl e = new EventImpl(new byte[0]);
    addBloom(bloom, e);
    super.append(e);

    // then close
    super.close();
  }

  /**
   * Adds the hash of the event body into the specified bloom filter set.
   */
  static void includeEvent(BloomSet bloom, Event e) {
    // Arrays.hashCode results in the same value across all jdks/machines and
    // jvm instances. (e.getBody().hashcode() is not guaranteed to do this)
    int hash = Arrays.hashCode(e.getBody());
    bloom.addInt(hash);
  }

  /**
   * Adds the serialized bloom set as an attribute to the specified event.
   */
  static void addBloom(BloomSet bloom, Event e) throws IOException {
    // ship the serialize bloom filter
    e.set(A_BLOOMSETDATA, bloom.getBytes());
  }

  /**
   * Builds a BloomCheckDeco with optional specified number of bits and number
   * of hash functions.
   */
  public static SinkDecoBuilder builder() {
    return new SinkDecoBuilder() {
      @Override
      public EventSinkDecorator<EventSink> build(Context ctx, String... argv) {
        Preconditions.checkArgument(argv.length <= 2,
            "usage: bloomCheck[(sz[,hashes])]");
        int sz = 100000000; // default: 100M bits.
        int hashes = 2; // default: # of hashes per insert/lookup
        if (argv.length >= 1) {
          sz = Integer.parseInt(argv[0]);
        }
        if (argv.length >= 2) {
          hashes = Integer.parseInt(argv[1]);
        }

        return new BloomGeneratorDeco(sz, hashes);
      }
    };
  }
}
TOP

Related Classes of com.cloudera.flume.handlers.debug.BloomGeneratorDeco

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.