// Input : the crawldb
platform.assertPathExists(crawlDbPath, "CrawlDb");
// TODO VMa - figure out types. Typed alternative to the text-scheme tap below (currently disabled):
//   Tap inputSource = platform.makeTap(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath);
Tap inputSource = platform.makeTap(platform.makeTextScheme(), crawlDbPath);
Pipe importPipe = new Pipe("import pipe");
// Apply a regex to extract the relevant fields
RegexParser crawlDbParser = new RegexParser(CrawlDbDatum.FIELDS,
"^(.*?)\t(.*?)\t(.*?)\t(.*?)\t(.*)");
importPipe = new Each(importPipe, new Fields("line"), crawlDbParser);
// Split into tuples that are to be fetched and that have already been fetched
SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());
Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
Pipe urlsToFetchPipe = splitter.getLHSPipe();
// Limit to MAX_DISTRIBUTED_FETCH if running in a real cluster,
// or MAX_LOCAL_FETCH if running locally. To do that, we first sort the
// entries from high to low by links score.
// TODO add unit test
urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
long maxToFetch = isLocal ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));
BaseScoreGenerator scorer = new LinkScoreGenerator();
// Create the sub-assembly that runs the fetch job
int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);
FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, platform.getNumReduceTasks());
Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
contentPipe = TupleLogger.makePipe(contentPipe, true);
// Create a parser that returns the raw HTML (cleaned up by Tika) as the parsed content.
SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);
Pipe analyzerPipe = new Pipe("analyzer pipe");
analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());
Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
resultsPipe = new Each(resultsPipe, new CreateResultsFunction());
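// Group the finished datums, status, analyzed pages, and outlinks by URL (outer join).
// NOTE(review): no separate "skipped datums" pipe is joined here - presumably skipped
// URLs arrive via the status pipe; confirm.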
Pipe updatePipe = new CoGroup("update pipe", Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)), null, new OuterJoin());
updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);
// output : loop dir specific crawldb
BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
Tap crawlDbSink = platform.makeTap(platform.makeTextScheme(), outCrawlDbPath, SinkMode.REPLACE);
// Status
BasePath statusDirPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath);
// Content
BasePath contentDirPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath);
// PageResults
BasePath resultsDirPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
Tap resultsSink = platform.makeTap(platform.makeTextScheme(), resultsDirPath);
// Create the output map that connects each tail pipe to the appropriate sink.
Map<String, Tap> sinkMap = new HashMap<String, Tap>();
sinkMap.put(updatePipe.getName(), crawlDbSink);
sinkMap.put(statusPipe.getName(), statusSink);