// Build and run the flow.
try {
    BixoPlatform platform = new BixoPlatform(DemoWebMiningTool.class, options.getPlatformMode());
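
    // Set up the working directory, using the seed URLs file to bootstrap the crawl.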
    BasePath workingDirPath = platform.makePath(options.getWorkingDir());
    setupWorkingDir(platform, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);
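
    // Find the output directory from the most recent crawl loop; one must exist,
    // since it holds the crawl DB we continue from.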
    BasePath latestDirPath = CrawlDirUtils.findLatestLoopDir(platform, workingDirPath);
    if (latestDirPath == null) {
        error("No previous cycle output dirs exist in " + workingDirPath, parser);
    }
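
    // Continue from the crawl DB written by that latest loop.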
    BasePath crawlDbPath = platform.makePath(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
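
    // Identify the crawler to the sites we fetch, with a contact email address and web address.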
    UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
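
    // Configure polite fetching: a crawl delay between requests, a cap on
    // fetched content size, and the efficient fetcher mode.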
    FetcherPolicy fetcherPolicy = new FetcherPolicy();
    fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
    fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
    fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
    // We only care about mime types that the Tika HTML parser can handle,
    // so restrict the fetcher's valid mime types to that set.
    Set<String> validMimeTypes = new HashSet<String>();
    Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
    for (MediaType supportedType : supportedTypes) {
        validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
    }
    fetcherPolicy.setValidMimeTypes(validMimeTypes);
    // Let's limit our crawl to two loops
    for (int curLoop = 1; curLoop <= 2; curLoop++) {
        BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, workingDirPath, curLoop);
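
        // Build the web mining workflow for this loop and run it to completion.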
        Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(platform, crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options);
        flow.complete();

        // Update crawlDbPath to point to the latest crawl db
        crawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);