String inputFileName = options.getInputFile();
String outputDirName = options.getOutputDir();
try {
BixoPlatform platform = new BixoPlatform(AnalyzeMbox.class, options.getPlatformMode());
// Create the input (source tap), which is just a sequence file reader. We assume
// that the file already has the results of splitting the mbox file into emails.
BasePath inputPath = platform.makePath(inputFileName);
platform.assertPathExists(inputPath, "input file");
Tap sourceTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
Pipe pipe = new Pipe("Email Analyzer");
pipe = new Each(pipe, new ParseEmailFunction());
// We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
// the message-id field, and then aggregate the scores within each group.
pipe = new GroupBy(pipe, new Fields(FieldNames.MESSAGE_ID));
pipe = new Every(pipe, new CalcMessageScoreBuffer(), Fields.RESULTS);
// Now we want to sum the scores for each user, which is another grouping/summing.
pipe = new GroupBy(pipe, new Fields(FieldNames.EMAIL_ADDRESS));
pipe = new Every(pipe, new SumScoresBuffer(), Fields.RESULTS);
// Let's filter out anybody with an uninteresting score (summed score <= 0.0).
ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE), Double.class);
pipe = new Each(pipe, filter);
// And let's sort in reverse order (high to low score)
pipe = new GroupBy(pipe, new Fields(FieldNames.SUMMED_SCORE), true);
// Create the output (sink tap)
Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
platform.makePath(outputDirName), SinkMode.REPLACE);
// Finally we can run it.
FlowConnector flowConnector = platform.makeFlowConnector();
Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
flow.complete();
} catch (Throwable t) {
System.err.println("Exception running AnalyzeMbox: " + t.getMessage());
t.printStackTrace(System.err);