// First stage: an Each named "url importer" applies LoadUrlFunction to the
// "line" field of the incoming tuples.
Fields urlLineField = new Fields("line");
Pipe importPipe = new Each("url importer", urlLineField, new LoadUrlFunction());

// Scorer and fetcher handed to the first fetch pipe. This fetcher is built
// without an explicit FetcherPolicy.
BaseScoreGenerator scorer = new FixedScoreGenerator();
BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);

// Fetch pipe fed by the importer output.
FetchPipe fetchPagePipe = new FetchPipe(
    importPipe,
    scorer,
    fetcher,
    NUM_REDUCERS);
// Content side of the first fetch: ParseModMboxPageFunction extracts URLs
// from the mod_mbox-generated page, and only the function's results
// (UrlDatum tuples) are passed along.
Pipe fetchedPageContentTail = fetchPagePipe.getContentTailPipe();
Pipe mboxPagePipe = new Each(
    fetchedPageContentTail,
    new ParseModMboxPageFunction(),
    Fields.RESULTS);

// Status side of the first fetch: give the status tail its own pipe name
// (MBOX_PAGE_STATUS_PIPE_NAME).
Pipe fetchedPageStatusTail = fetchPagePipe.getStatusTailPipe();
Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchedPageStatusTail);
// Mailbox files can be big (e.g. 4MB), so the second fetch stage uses a
// FetcherPolicy whose max content size is raised to MAX_CONTENT_SIZE.
FetcherPolicy defaultPolicy = new FetcherPolicy();
defaultPolicy.setMaxContentSize(MAX_CONTENT_SIZE);

// Point 'fetcher' at a new SimpleHttpFetcher that carries the policy above.
// This must happen before the FetchPipe below is constructed, since that
// constructor receives whatever 'fetcher' refers to at that moment.
fetcher = new SimpleHttpFetcher(
    MAX_THREADS,
    defaultPolicy,
    userAgent);

// Second fetch stage, fed by the URLs parsed out of the mod_mbox page; its
// output is handed to the SplitEmails mbox splitter.
FetchPipe fetchMboxPipe = new FetchPipe(
    mboxPagePipe,
    scorer,
    fetcher,
    NUM_REDUCERS);
SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);
// Now create the pipe that's going to analyze the emails we get after splitting them up.
Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
analysisPipe = new Each(analysisPipe, new ParseEmailFunction());