// Configure the run: working directory, agent name, and local platform mode.
options.setWorkingDir(WORKING_DIR);
options.setAgentName("test-agent");
options.setLocalPlatformMode(true);
// Build the platform from the options' platform mode and resolve the working directory on it.
BixoPlatform platform = new BixoPlatform(DemoWebMiningWorkflowTest.class, options.getPlatformMode());
BasePath workingDirPath = platform.makePath(WORKING_DIR);
// Prepare the working directory using the "/test-seedurls.txt" resource.
// NOTE(review): presumably this seeds an initial crawl DB from the listed URLs — confirm in DemoWebMiningTool.setupWorkingDir.
DemoWebMiningTool.setupWorkingDir(platform, workingDirPath, "/test-seedurls.txt");
// Locate the most recent loop directory under the working directory, and the crawl DB
// subdirectory inside it; crawlDbPath is the crawl DB handed to the first workflow run below.
BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
// Fetcher policy: crawl delay and max content size come from CrawlConfig, the fetcher mode is
// EFFICIENT, and the valid mime types are limited to plain text and HTML.
FetcherPolicy fetcherPolicy = new FetcherPolicy();
fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
Set<String> validMimeTypes = new HashSet<String>();
validMimeTypes.add("text/plain");
validMimeTypes.add("text/html");
fetcherPolicy.setValidMimeTypes(validMimeTypes);
// User agent built from the agent name set above plus CrawlConfig's email and web addresses.
UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
// Declared outside the try so the finally block can stop the server if it was started.
Server server = null;
try {
// Start a server on port 8089 with a DirectoryResponseHandler over src/test/resources/test-pages.
// NOTE(review): presumably this serves the fixture pages that the seed URLs point at — confirm against test-seedurls.txt.
server = startServer(new DirectoryResponseHandler("src/test/resources/test-pages"), 8089);
// First loop: create loop directory 1 and run the workflow to completion against the initial crawl DB.
BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 1);
Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
flow.complete();
// Validate the first loop's status, content and crawldb subdirectories.
// NOTE(review): the integer argument is presumably the expected entry count (1 / 1 / 3 here) — confirm in validateEntryCount.
BasePath statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
validateEntryCount(platform, statusPath, null, 1, "status", true);
BasePath contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 1, "content", false);
// crawlDbPath is re-pointed at loop 1's crawl DB subdirectory; it is validated here and then
// passed as the crawl DB for the second loop.
crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
validateEntryCount(platform, crawlDbPath, null, 3, "crawldb", true);
// Second loop: create loop directory 2 and run the workflow again, using loop 1's crawl DB.
curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, 2);
flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
flow.complete();
// Validate the second loop's status, content and crawldb subdirectories (2 / 2 / 8 here).
statusPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
validateEntryCount(platform, statusPath, null, 2, "status", true);
contentPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
validateEntryCount(platform, contentPath, FetchedDatum.FIELDS, 2, "content", false);
crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
validateEntryCount(platform, crawlDbPath, null, 8, "crawldb", true);
// Additionally require that the page-score check on loop 2's crawl DB passes.
assertTrue(validatePageScores(platform, crawlDbPath));
// Validate the results subdirectory produced by the second loop.
BasePath resultsPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
validateEntryCount(platform, resultsPath, null, 3, "page results", true);
} finally {
// Stop the server if one was assigned, whether or not the workflow runs and validations succeeded.
if (server != null) {
server.stop();
}