options.setLocalPlatformMode(true);
UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
Server server = null;
try {
server = startServer(new FakeWebSiteHandler(), 8089);
Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
flow.complete();
// Update the crawlDb path
crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
// Now we should have an output/1-<timestamp>/ directory, where the
// /urls dir has 11 entries: one that was previously crawled, and
// the other 10 still pending.
Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());
int numFetched = 0;
int numPending = 0;
while (iter.hasNext()) {
CrawlDbDatum datum = new CrawlDbDatum(iter.next());
UrlStatus status = datum.getLastStatus();
int crawlDepth = datum.getCrawlDepth();
if (datum.getLastFetched() != 0) {
numFetched += 1;
assertEquals(UrlStatus.FETCHED, status);
assertEquals(0, crawlDepth);
} else {
numPending += 1;
assertEquals(UrlStatus.UNFETCHED, status);
assertEquals(1, crawlDepth);
}
}
assertEquals(1, numFetched);
assertEquals(10, numPending);
// Do it one more time, to verify status gets propagated forward.
curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);
flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
flow.complete();
// Update crawldb path
crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
iter = crawldbTap.openForRead(platform.makeFlowProcess());