// Write one data file containing a single already-fetched URL
// (LAST_FETCHED_FIELD = 2).
BasePath workingDirPath = platform.makePath(WORKINGDIR);
BasePath fetchedDatumsPath = platform.makePath(workingDirPath, "fetched");
ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
fetchedDatums.add(fetchedDatum1);
createDataFile(platform, fetchedDatumsPath, fetchedDatums);

// Write a second data file with two unfetched entries (LAST_FETCHED_FIELD = 0)
// for the very same URL, so all three datums land in one group below.
BasePath unfetchedDatumsPath = platform.makePath(workingDirPath, "unfetched");
ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
unfetchedDatums.add(unfetchedDatum1);
UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
unfetchedDatums.add(unfetchedDatum2);
createDataFile(platform, unfetchedDatumsPath, unfetchedDatums);

// Build the workflow: each data file feeds its own named pipe, and the source
// map binds every tap to the pipe of the same name.
Tap inputSource1 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), fetchedDatumsPath);
Pipe fetchedPipe = new Pipe("fetched");
Tap inputSource2 = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), unfetchedDatumsPath);
Pipe unfetchedPipe = new Pipe("unfetched");
Map<String, Tap> sources = new HashMap<String, Tap>();
sources.put(fetchedPipe.getName(), inputSource1);
sources.put(unfetchedPipe.getName(), inputSource2);

BasePath resultsPath = platform.makePath(workingDirPath, "results");
Tap resultSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath, SinkMode.REPLACE);

// Merge both pipes, group by URL, and run LatestUrlDatumBuffer over each group.
// NOTE(review): the buffer is expected to emit only the most recently fetched
// datum per URL - that is what the assertions at the end check.
Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
        new Fields(UrlDatum.URL_FN));
resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);
FlowConnector flowConnector = platform.makeFlowConnector();
Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
flow.complete();

// Read the results back and verify that exactly one tuple came out and that
// it carries the fetched datum's LAST_FETCHED_FIELD value.
Tap testSink = platform.makeTap(platform.makeBinaryScheme(UrlDatum.FIELDS), resultsPath);
TupleEntryIterator reader = testSink.openForRead(platform.makeFlowProcess());
int count = 0;
long latest = 0;
try {
    while (reader.hasNext()) {
        TupleEntry next = reader.next();
        UrlDatum datum = new UrlDatum(next);
        latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
        count++;
    }
} finally {
    // Always release the underlying file handle, even if reading or the cast
    // above fails; otherwise the results file stays open for the rest of the run.
    reader.close();
}
assertEquals(1, count);
assertEquals(2, latest);