// Collect the pipes that will terminate at sink taps, so the flow can be wired up later.
List<Pipe> tailPipes = new ArrayList<Pipe>();
if (options.isGenerateHTML()) {
    // Let's write out the parse as text:
    Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
    textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                    new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true), Fields.REPLACE);
    textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN), new Identity());
    BasePath textParsePath = platform.makePath(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
    Tap textParseTap = platform.makeTap(platform.makeTextScheme(), textParsePath, SinkMode.REPLACE);
    sinkMap.put(textParsePipe.getName(), textParseTap);
    tailPipes.add(textParsePipe);
}
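// The text sink above ends up with roughly one line per parsed page: the URL
// plus its extracted text, with CR/LF/tab runs collapsed to single spaces.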
// Let's output a WritableSequenceFile as an example; this file can
// then be used as input when working with Mahout.
// For now we only do this when we are running in Hadoop mode.
Tap writableSeqFileSink = null;
Pipe writableSeqFileDataPipe = null;
if (!options.isLocalPlatformMode()) {
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
                    new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
    BasePath writableSeqFileDataPath = platform.makePath(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    WritableSequenceFile writableSeqScheme = new WritableSequenceFile(
                    new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
                    Text.class, Text.class);
    writableSeqFileSink = platform.makeTap(writableSeqScheme, writableSeqFileDataPath, SinkMode.REPLACE);
}
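// Both the key and value of the sequence file are written as Text (presumably the
// URL and its extracted text), which matches the SequenceFile<Text, Text> input
// that Mahout tools such as seq2sparse consume.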
Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
if (urlFilter != null) {
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
}
urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);
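// urlFromOutlinksPipe now emits one UrlDatum per outlink that survived
// normalization, validation, and (optionally) the URL filter; TupleLogger.makePipe()
// wraps the pipe with a tuple-logging operation for debugging.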
// Take the fetch status stream and emit the URLs from it.
Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);
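// urlFromFetchPipe turns each entry of the fetch status stream back into a
// UrlDatum, so the URLs we attempted this loop also feed into the crawl DB update.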
// Finally join the URLs we get from parsing content with the URLs we got
// from the status output, and the URLs we didn't process from the crawl DB,
// so that we have a unified stream of all known URLs for the crawl DB.
Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);
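// At this point all three branches (outlinks, fetch status, and the URLs from
// the crawl DB we didn't process this loop) emit UrlDatums, so they can be
// merged by URL below.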
// NOTE: Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
// and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
// The reason this isn't done here is that we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
Pipe crawlDbPipe = new GroupBy("crawldb pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
                new Fields(UrlDatum.URL_FN));
crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);
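// Grouping by URL and running LatestUrlDatumBuffer leaves a single UrlDatum per
// unique URL (by its name, the most recent one), de-duplicating the merged stream.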
Pipe outputPipe = new Pipe("output pipe", crawlDbPipe);
outputPipe = new Each(outputPipe, new CreateCrawlDbDatumFromUrlFunction());
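// outputPipe now carries the CrawlDbDatums for the updated crawl DB. A rough
// sketch of how it would be bound to its sink, alongside the other tails
// (crawlDbTap here is illustrative, not defined in this code):
//
//     tailPipes.add(outputPipe);
//     sinkMap.put(outputPipe.getName(), crawlDbTap);
//     // ...then connect the source taps, sinkMap, and tailPipes with a
//     // FlowConnector and run the resulting flow.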