/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.reporter.sampler;
import java.io.IOException;
import java.util.List;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkDecoBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.core.EventSinkDecorator;
import com.cloudera.util.ReservoirSampler;
import com.google.common.base.Preconditions;
/**
* This uses reservoir sampling to choose, with uniform probability, the
* specified number of events fed to this sink. When this sink is closed, it
* flushes the current sample set through to the decorated sink.
*
* This can be used in conjunction with a time-based sink (RollSink,
* HistoryReporters, etc) to throttle the number of elements sampled in a given
* amount of time in a single pass, without having to know the number of events
* that occurred in a given time period. Specifically, the HistoryReporter would
* decorate a ReservoirSamplerSink that in turn decorates an expensive filtering
* or categorization function (e.g. histogramming based on java regex).
*
* NOTE: A side effect of the reservoir sampling is that the elements in the
* sample will most likely be delivered out of order.
*/
public class ReservoirSamplerDeco<R extends EventSink> extends
    EventSinkDecorator<R> {
  /** Holds the events handed to {@link #append(Event)} until the next flush. */
  final ReservoirSampler<Event> sampler;

  /**
   * Creates a decorator that samples the events appended to it.
   *
   * @param snk
   *          the sink being decorated; sampled events are appended to it on
   *          {@link #flush()}
   * @param samples
   *          value passed to the {@link ReservoirSampler} constructor as the
   *          sample size
   */
  public ReservoirSamplerDeco(R snk, int samples) {
    super(snk);
    this.sampler = new ReservoirSampler<Event>(samples);
  }

  /**
   * Flushes the current sample to the decorated sink and then closes this
   * decorator via the superclass.
   *
   * The superclass close is attempted even when the flush fails, so a failure
   * while forwarding a sampled event does not skip it. If both the flush and
   * the close throw, the exception from the close is the one that propagates.
   *
   * @throws IOException
   *           if forwarding the sample or closing fails
   */
  @Override
  public void close() throws IOException {
    try {
      flush();
    } finally {
      // Always attempt the close, even if delivering the sample failed.
      super.close();
    }
  }

  /**
   * Appends every event in the current sample to the decorated sink and then
   * clears the sampler.
   *
   * If appending an event throws, the exception propagates immediately and the
   * sampler is not cleared.
   *
   * @throws IOException
   *           if the decorated sink fails to accept an event
   */
  public void flush() throws IOException {
    Preconditions.checkNotNull(sampler);
    List<Event> es = sampler.sample();
    for (Event e : es) {
      getSink().append(e);
    }
    sampler.clear();
  }

  /**
   * Offers the event to the sampler. Nothing is forwarded to the decorated
   * sink here; forwarding only happens in {@link #flush()}.
   *
   * @param v
   *          the event to offer to the sampler
   */
  @Override
  public void append(Event v) {
    sampler.onNext(v);
  }

  /**
   * Returns a builder that creates this decorator from a single string
   * argument holding the sample size.
   *
   * The built decorator is constructed with a {@code null} sink.
   * NOTE(review): presumably the caller attaches the real sink afterwards --
   * confirm against the code that uses the builder.
   *
   * @return a builder whose {@code build} throws
   *         {@link IllegalArgumentException} unless exactly one argument is
   *         given, and {@link NumberFormatException} if that argument is not a
   *         parsable integer
   */
  public static SinkDecoBuilder builder() {
    return new SinkDecoBuilder() {
      @Override
      public EventSinkDecorator<EventSink> build(Context context,
          String... argv) {
        Preconditions.checkArgument(argv.length == 1,
            "usage: reservoirSampler(sample)");
        int sample = Integer.parseInt(argv[0]);
        return new ReservoirSamplerDeco<EventSink>(null, sample);
      }
    };
  }
}