// Build the fetch sub-assembly: group incoming URLs, filter/score them against robots rules,
// split off the URLs that won't be fetched, fetch the rest, and finally produce a content pipe
// plus a status pipe that covers every input URL.

// Run each tuple from urlProvider through GroupFunction (using GroupByDomain as the key
// generator), then group on that key so that FilterAndScoreByUrlAndRobots processes one group
// at a time. Fields.RESULTS keeps only the fields emitted by the operation.
// NOTE(review): the pipe name below says "IP/delay" but the key generator is GroupByDomain -
// confirm which one the name should reflect.
Pipe robotsPipe = new Each(urlProvider, new GroupFunction(new GroupByDomain()));
robotsPipe = new GroupBy("Grouping URLs by IP/delay", robotsPipe, GroupedUrlDatum.getGroupingField());
robotsPipe = new Every(robotsPipe, new FilterAndScoreByUrlAndRobots(robotsFetcher, parser, scorer), Fields.RESULTS);
// Split into records for URLs that are special (not fetchable) and regular.
// The LHS pipe is used below for the never-fetched URLs, the RHS pipe for the URLs to fetch.
SplitterAssembly splitter = new SplitterAssembly(robotsPipe, new SplitIntoSpecialAndRegularKeys());
// Now generate sets of URLs to fetch. We'll wind up with all URLs for the same grouping key
// (presumably the same server & the same crawl delay - confirm against GroupedUrlDatum),
// ordered by score, getting passed per list to the MakeFetchSetsBuffer. That buffer is expected
// to generate FetchSetDatums that contain a key based on the hash of the IP address (with a
// range of values == number of reducers), plus a list of URLs and a target crawl time.
// NOTE(review): the description of the FetchSetDatum contents comes from the earlier
// PreFetchBuffer/PreFetchDatum wording of this comment - verify against MakeFetchSetsBuffer.
// NOTE(review): the trailing 'true' is presumably the GroupBy reverse-order flag, i.e. sort
// descending by score - confirm.
Pipe prefetchPipe = new GroupBy("Distributing URL sets", splitter.getRHSPipe(), GroupedUrlDatum.getGroupingField(), ScoredUrlDatum.getSortingField(), true);
prefetchPipe = new Every(prefetchPipe, new MakeFetchSetsBuffer(fetchJobPolicy, numReducers), Fields.RESULTS);
// Regroup the fetch sets on FetchSetDatum's grouping field, sorted by its sorting field, and
// hand each group to FetchBuffer, which uses the supplied fetcher.
Pipe fetchPipe = new GroupBy("Fetching URL sets", prefetchPipe, FetchSetDatum.getGroupingField(), FetchSetDatum.getSortingField());
fetchPipe = new Every(fetchPipe, new FetchBuffer(fetcher), Fields.RESULTS);
// Branch the fetch output two ways: one branch passes through FilterErrorsFunction and becomes
// the content pipe, the other passes through MakeStatusFunction and becomes the fetched-status pipe.
Pipe fetchedContent = new Pipe(CONTENT_PIPE_NAME, new Each(fetchPipe, new FilterErrorsFunction()));
Pipe fetchedStatus = new Pipe("fetched status", new Each(fetchPipe, new MakeStatusFunction()));
// We need to merge URLs from the LHS of the splitter (never fetched) so that our status pipe
// gets status for every URL we put into this sub-assembly.
Pipe skippedStatus = new Pipe("skipped status", new Each(splitter.getLHSPipe(), new MakeSkippedStatus()));
// TODO KKr - The group name is already being set here (so that the tail pipe gets the same
// name), so it wasn't possible to pass in a separate group name for BaseTool.nameFlowSteps
// to use for the job name.
// Merge the skipped and fetched status pipes by grouping both on the StatusDatum URL field.
Pipe joinedStatus = new GroupBy(STATUS_PIPE_NAME, Pipe.pipes(skippedStatus, fetchedStatus), new Fields(StatusDatum.URL_FN));