/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.handlers.debug;
import java.io.IOException;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkDecoBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.core.EventSinkDecorator;
import com.cloudera.util.bloom.BloomSet;
import com.google.common.base.Preconditions;
/**
* This decorator takes hashes of messages and then inserts them into a bloom
* filter. On deco close, the bit map representation of the bloom filter is
* transmittted into the stream.
*
* A corresponding BloomChecker can track received message and can approximately
* verify that all messages injected were included. If the generator sends
* messages that aren't received by the checker, probabilistically this should
* detect the omission.
*
* The probability of a dropped message not being detected is equivalent to the
* probability of that dropped message being a false positive if queried. (ie,
* the addition of the message to the set added no new information). Wikipedia
* says to use at a minimum of 9 bits per inserted item, and that extra hashes
* roughly decrease the probability by an order of magnitude.
*
*/
public class BloomGeneratorDeco extends EventSinkDecorator<EventSink> {
public static final Logger LOG = LoggerFactory.getLogger(BloomGeneratorDeco.class);
protected BloomSet bloom;
final int size; // size of bloom bit array in bits
final int hashes; // number of hashes per insertion/membership test
public final static String A_BLOOMSETDATA = "bloomSetData";
/**
* This generator must have the same size and # hash as the downstream
* BloomCheckDeco.
*/
public BloomGeneratorDeco(EventSink s, int size, int hashes) {
super(s);
this.size = size;
this.hashes = hashes;
}
/**
* The default sink here is null and must be set by setSink before usage.
*
* This generator must have the same size and # hash as the downstream
* BloomCheckDeco.
*/
public BloomGeneratorDeco(int size, int hashes) {
this(null, size, hashes);
}
/** {@inheritDoc} */
@Override
public void open() throws IOException {
bloom = new BloomSet(size, hashes);
super.open();
}
/** {@inheritDoc} */
@Override
public void append(Event e) throws IOException {
// take a hash of the bytes contents and add them to bloom filter.
includeEvent(bloom, e);
// then just send the data
super.append(e);
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
EventImpl e = new EventImpl(new byte[0]);
addBloom(bloom, e);
super.append(e);
// then close
super.close();
}
/**
* Adds the hash of the event body into the specified bloom filter set.
*/
static void includeEvent(BloomSet bloom, Event e) {
// Arrays.hashCode results in the same value across all jdks/machines and
// jvm instances. (e.getBody().hashcode() is not guaranteed to do this)
int hash = Arrays.hashCode(e.getBody());
bloom.addInt(hash);
}
/**
* Adds the serialized bloom set as an attribute to the specified event.
*/
static void addBloom(BloomSet bloom, Event e) throws IOException {
// ship the serialize bloom filter
e.set(A_BLOOMSETDATA, bloom.getBytes());
}
/**
* Builds a BloomCheckDeco with optional specified number of bits and number
* of hash functions.
*/
public static SinkDecoBuilder builder() {
return new SinkDecoBuilder() {
@Override
public EventSinkDecorator<EventSink> build(Context ctx, String... argv) {
Preconditions.checkArgument(argv.length <= 2,
"usage: bloomCheck[(sz[,hashes])]");
int sz = 100000000; // default: 100M bits.
int hashes = 2; // default: # of hashes per insert/lookup
if (argv.length >= 1) {
sz = Integer.parseInt(argv[0]);
}
if (argv.length >= 2) {
hashes = Integer.parseInt(argv[1]);
}
return new BloomGeneratorDeco(sz, hashes);
}
};
}
}