        DemoCrawlToolOptions options = new DemoCrawlToolOptions();
        options.setUseBoilerpipe(true);
        options.setLocalPlatformMode(true);
        UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
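
        // Start a local test web server on port 8089 that serves the fake site
        // the crawl will fetch from.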
        Server server = null;
        try {
            server = startServer(new FakeWebSiteHandler(), 8089);
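
            // Build and run a single crawl loop against the fake site.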
            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();

            // Update the crawlDb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            // Now we should have an output/1-<timestamp>/ directory, where the
            // /urls dir has 11 entries: one previously crawled and the other 10 pending.
            Tap crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            TupleEntryIterator iter = crawldbTap.openForRead(platform.makeFlowProcess());
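
            // Walk the crawl DB and tally fetched vs. pending entries, checking
            // the status and crawl depth of each one.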
            int numFetched = 0;
            int numPending = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int crawlDepth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;
                    assertEquals(UrlStatus.FETCHED, status);
                    assertEquals(0, crawlDepth);
                } else {
                    numPending += 1;
                    assertEquals(UrlStatus.UNFETCHED, status);
                    assertEquals(1, crawlDepth);
                }
            }
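
            // After the first loop we expect one fetched entry (depth 0) and
            // ten pending entries (depth 1).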
            assertEquals(1, numFetched);
            assertEquals(10, numPending);

            // Do it one more time, to verify status gets propagated forward.
            curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 2);
            flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options);
            flow.complete();

            // Update the crawlDb path
            crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            crawldbTap = platform.makeTap(platform.makeBinaryScheme(CrawlDbDatum.FIELDS), crawlDbPath);
            iter = crawldbTap.openForRead(platform.makeFlowProcess());
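
            // Recount the status and crawl-depth distribution after the second loop.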
            numFetched = 0;
            numPending = 0;
            int numDepth0 = 0;
            int numDepth1 = 0;
            int numDepth2 = 0;
            while (iter.hasNext()) {
                CrawlDbDatum datum = new CrawlDbDatum(iter.next());
                UrlStatus status = datum.getLastStatus();
                int depth = datum.getCrawlDepth();
                if (datum.getLastFetched() != 0) {
                    numFetched += 1;
                    assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
                } else {
                    numPending += 1;
                    assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
                }

                if (depth == 0) {
                    numDepth0 += 1;
                } else if (depth == 1) {
                    numDepth1 += 1;
                } else if (depth == 2) {
                    numDepth2 += 1;
                } else {
                    fail("Invalid crawl depth for " + datum.getUrl());
                }

                // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
                //     datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
            }
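
            // Second loop: the 11 previously-known URLs (1 at depth 0, 10 at depth 1)
            // should now be fetched, with 100 newly discovered URLs pending at depth 2.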
            assertEquals(11, numFetched);
            assertEquals(100, numPending);
            assertEquals(1, numDepth0);
            assertEquals(10, numDepth1);
            assertEquals(100, numDepth2);
        } catch (Throwable t) {
            fail(t.getMessage());
        } finally {
            if (server != null) {
                server.stop();
            }
        }
    }