/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.reporter.histogram;
import java.io.IOException;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.reporter.MultiReporter;
import com.cloudera.flume.reporter.builder.SimpleRegexReporterBuilder;
import com.google.common.base.Preconditions;
/**
* This takes a regex and a group index and generates a histogram based on the
* value extracted. Values that do not match are not counted.
*
* For example: the group index and regex combo of: 3, (\d+):(\d+):(\d+)
*
* for the following values: 123:456:789, abc:def:xyz, 11:22:33, 55:66:33
*
* would result in a histogram with (value, count) : (789, 1), (33,2).
*
* NOTE: the NFA-based regex algorithm used by java.util.regex.* (and in this
* class) is slow and does not scale. It is fully featured but has an
* exponential worst-case running time. This will be replaced with a faster but
* more memory hungry and less featured DFA-based regex algorithm. (We will lose
* capture groups).
*/
public class RegexGroupHistogramSink extends HistogramSink {
  /** Pattern searched for in each event body. */
  Pattern pat;

  /** Index of the capture group whose text is returned by {@link #extract}. */
  int grp;

  /**
   * Creates a histogram sink keyed on one capture group of a regex.
   *
   * @param name name passed through to the parent {@code HistogramSink}
   * @param pat compiled pattern applied to each event body
   * @param grp capture group index handed to {@link Matcher#group(int)};
   *        0 denotes the entire match
   */
  public RegexGroupHistogramSink(String name, Pattern pat, int grp) {
    super(name);
    this.pat = pat;
    this.grp = grp;
  }

  /**
   * Extracts the configured capture group from the first place in the event
   * body where the pattern is found.
   *
   * @param e event whose body is searched
   * @return the text captured by group {@code grp}, or {@code null} when the
   *         pattern is not found anywhere in the body (or when the group did
   *         not participate in the match)
   * @throws IndexOutOfBoundsException if the pattern matches but has no
   *         capture group with index {@code grp}
   */
  @Override
  public String extract(Event e) {
    // NOTE(review): the body is decoded with the platform default charset;
    // confirm which encoding event bodies are expected to use.
    String body = new String(e.getBody());
    // find() searches for the pattern anywhere in the body; it does not
    // require the whole body to match.
    Matcher m = pat.matcher(body);
    if (m.find()) {
      return m.group(grp);
    }
    return null;
  }

  /**
   * Returns a builder that creates sinks from a regex spec file.
   *
   * The builder takes exactly one argument, the spec file name, and loads
   * it with {@link SimpleRegexReporterBuilder}. When the file yields exactly
   * one sink that sink is returned directly; otherwise the loaded sinks are
   * wrapped in a {@link MultiReporter} named after the spec file.
   *
   * The built sink throws {@link IllegalArgumentException} when the argument
   * count is wrong or when the spec file cannot be loaded.
   */
  public static SinkBuilder builder() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length == 1,
            "usage: regexhistospec(regexspecfile)");
        String fname = argv[0];
        SimpleRegexReporterBuilder srrb = new SimpleRegexReporterBuilder(fname);
        Collection<RegexGroupHistogramSink> sinks;
        try {
          sinks = srrb.load();
        } catch (IOException e) {
          // Chain the IOException as the cause so its stack trace is not
          // lost; the message text is unchanged.
          throw new IllegalArgumentException(
              "Failed to create regex report from spec file " + fname + ": "
                  + e, e);
        }
        if (sinks.size() == 1) {
          // No need for a wrapper around a single sink.
          return sinks.iterator().next();
        }
        return new MultiReporter(fname, sinks);
      }
    };
  }

  /**
   * Returns a builder that creates a single sink from inline arguments.
   *
   * The builder takes exactly three arguments: the sink name, the regex,
   * and the capture group index as a decimal integer.
   *
   * The built sink throws {@link IllegalArgumentException} when the argument
   * count is wrong. A non-numeric index or an invalid regex surfaces as
   * {@link NumberFormatException} or
   * {@link java.util.regex.PatternSyntaxException} respectively, both of
   * which are subclasses of {@code IllegalArgumentException}.
   */
  public static SinkBuilder builderSimple() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... argv) {
        Preconditions.checkArgument(argv.length == 3,
            "usage: regexhisto(name, regex, idx)");
        String name = argv[0];
        String regex = argv[1];
        // Primitive int: the constructor takes an int, so boxing is wasted.
        int idx = Integer.parseInt(argv[2]);
        Pattern pat = Pattern.compile(regex);
        return new RegexGroupHistogramSink(name, pat, idx);
      }
    };
  }
}