Package com.cloudera.flume.reporter.sampler

Source Code of com.cloudera.flume.reporter.sampler.ReservoirSamplerDeco

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.reporter.sampler;

import java.io.IOException;
import java.util.List;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkDecoBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.core.EventSinkDecorator;
import com.cloudera.util.ReservoirSampler;
import com.google.common.base.Preconditions;

/**
* This uses reservoir sampling to choose, with uniform probability, the
* specified number of events from those fed to this sink. When this sink is
* closed, it flushes the current sample set through to the decorated sink.
*
* This can be used in conjunction with a time-based sink (RollSink,
* HistoryReporters, etc) to throttle the number of elements sampled in a given
* amount of time in a single pass, without having to know the number of events
* that occurred in a given time period. Specifically, the HistoryReporter would
* decorate a ReservoirSamplerDeco that in turn decorates an expensive filtering
* or categorization function (e.g. histogramming based on Java regular
* expressions).
*
* NOTE: A side effect of the reservoir sampling is that the elements in the
* sample will most likely be delivered out of order.
*/
public class ReservoirSamplerDeco<R extends EventSink> extends
    EventSinkDecorator<R> {

  final ReservoirSampler<Event> sampler;

  public ReservoirSamplerDeco(R snk, int samples) {
    super(snk);
    this.sampler = new ReservoirSampler<Event>(samples);
  }

  @Override
  public void close() throws IOException {
    flush();
    super.close();
  }

  public void flush() throws IOException {
    Preconditions.checkNotNull(sampler);
    List<Event> es = sampler.sample();
    for (Event e : es) {
      getSink().append(e);
    }
    sampler.clear();
  }

  @Override
  public void append(Event v) {
    sampler.onNext(v);
  }

  public static SinkDecoBuilder builder() {

    return new SinkDecoBuilder() {
      @Override
      public EventSinkDecorator<EventSink> build(Context context,
          String... argv) {
        Preconditions.checkArgument(argv.length == 1,
            "usage: reservoirSampler(sample)");
        int sample = Integer.parseInt(argv[0]);
        return new ReservoirSamplerDeco<EventSink>(null, sample);
      }
    };
  }

}
TOP

Related Classes of com.cloudera.flume.reporter.sampler.ReservoirSamplerDeco

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.