// Read the input file name and the output directory name from the options.
String inputFileName = options.getInputFile();
String outputDirName = options.getOutputDir();
try {
// The platform object is the factory for every path, scheme and tap used below.
BixoPlatform platform = new BixoPlatform(AnalyzeEmail.class, options.getPlatformMode());

// Source tap: the input file, read with the platform's text scheme.
BasePath sourcePath = platform.makePath(inputFileName);
Tap sourceTap = platform.makeTap(platform.makeTextScheme(), sourcePath);
// Collaborators shared by the fetch sub-assemblies: the user agent we identify as,
// a fixed score generator, and an HTTP fetcher.
UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
Pipe urlImportPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
BaseScoreGenerator scorer = new FixedScoreGenerator();
BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);

// First fetch stage: takes the "line" field of each input record through
// LoadUrlFunction and fetches the resulting mod_mbox-generated pages.
FetchPipe pageFetchPipe = new FetchPipe(urlImportPipe, scorer, fetcher, NUM_REDUCERS);

// Content tail: parse each fetched mod_mbox page and emit UrlDatum tuples for the
// URLs extracted from it.
Pipe mboxPagePipe = new Each(
    pageFetchPipe.getContentTailPipe(),
    new ParseModMboxPageFunction(),
    Fields.RESULTS);

// Status tail: give it its own name so it can be bound to a dedicated sink.
Pipe mboxPageStatusPipe = new Pipe(
    MBOX_PAGE_STATUS_PIPE_NAME,
    pageFetchPipe.getStatusTailPipe());
// Mailbox files can be large (e.g. 4MB), so the second fetch stage gets its own
// FetcherPolicy with a raised maximum content size.
FetcherPolicy mboxPolicy = new FetcherPolicy();
mboxPolicy.setMaxContentSize(MAX_CONTENT_SIZE);

// Replace the fetcher with one that honours the larger-content policy.
fetcher = new SimpleHttpFetcher(MAX_THREADS, mboxPolicy, userAgent);

// Second fetch stage: fetch the URLs emitted by the page parser, then hand the
// fetch pipe to the mbox splitter.
FetchPipe mboxFetchPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
SplitEmails splitterPipe = new SplitEmails(mboxFetchPipe);
// The analysis branch hangs off the first tail of the splitter and parses every
// split-out email.
Pipe firstSplitterTail = splitterPipe.getTails()[0];
Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, firstSplitterTail);
analysisPipe = new Each(analysisPipe, new ParseEmailFunction());

// Parsing yields ANALYZED_EMAIL_FIELDS. Group on message-id and aggregate the
// scores within each group.
Fields byMessageId = new Fields(FieldNames.MESSAGE_ID);
analysisPipe = new GroupBy(analysisPipe, byMessageId);
analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

// Second grouping: sum the scores per email address (i.e. per user).
Fields byEmailAddress = new Fields(FieldNames.EMAIL_ADDRESS);
analysisPipe = new GroupBy(analysisPipe, byEmailAddress);
analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);

// Drop anybody with an uninteresting score; the filter expression matches
// summed scores that are <= 0.0.
String lowScoreExpression = String.format("%s <= 0.0", FieldNames.SUMMED_SCORE);
ExpressionFilter lowScoreFilter = new ExpressionFilter(lowScoreExpression, Double.class);
analysisPipe = new Each(analysisPipe, lowScoreFilter);

// Final ordering: group on the summed score with reverse order requested, so
// results come out from high score to low score.
Fields bySummedScore = new Fields(FieldNames.SUMMED_SCORE);
analysisPipe = new GroupBy(analysisPipe, bySummedScore, true);
// Create the sink taps. Every sink lives in its own sub-directory of the output
// directory and replaces whatever a previous run left there.
BasePath outputPath = platform.makePath(outputDirName);

// Status of the mod_mbox-generated index pages (text).
Tap pageStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
    platform.makePath(outputPath, "page-status"), SinkMode.REPLACE);

// Status of the mailbox file fetches (text).
Tap mboxStatusSinkTap = platform.makeTap(platform.makeTextScheme(),
    platform.makePath(outputPath, "mbox-status"), SinkMode.REPLACE);

// Content sink, stored in binary form with the FetchedDatum fields.
Tap contentSinkTap = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS),
    platform.makePath(outputPath, "content"), SinkMode.REPLACE);

// Final analysis results (text).
Tap analyzerSinkTap = platform.makeTap(platform.makeTextScheme(),
    platform.makePath(outputPath, "analysis"), SinkMode.REPLACE);

// Bind each sink to the name of the tail pipe that feeds it. The map holds four
// entries, so the default capacity is used instead of the former stale hint of 2.
HashMap<String, Tap> sinkTapMap = new HashMap<String, Tap>();
sinkTapMap.put(MBOX_PAGE_STATUS_PIPE_NAME, pageStatusSinkTap);
sinkTapMap.put(FetchPipe.STATUS_PIPE_NAME, mboxStatusSinkTap);
sinkTapMap.put(SPLITTER_PIPE_NAME, contentSinkTap);
sinkTapMap.put(ANALYZER_PIPE_NAME, analyzerSinkTap);
LOGGER.info("Running fetch job with " + options);

// Wire the single source, the named sinks and the three tail assemblies into one flow.
Flow emailFlow = platform.makeFlowConnector()
    .connect(sourceTap, sinkTapMap, splitterPipe, mboxPageStatusPipe, analysisPipe);

// Write the flow out as a DOT file before running it.
emailFlow.writeDOT("build/goodFlow.dot");

// Finally, run the flow.
emailFlow.complete();
} catch (Throwable t) {
System.err.println("Exception running AnalyzeEmail: " + t.getMessage());