format( "Storage folder %s can not be created, please verify you have enough permissions",
storageFolder ) );
}
}
final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
siteCrawler.setNumOfCrawlers( numCrawlers );
siteCrawler.setMaxPages( maxPages );
siteCrawler.setMaxDepth( maxDepth );
siteCrawler.setPolitenessDelay(politenessDelay);
siteCrawler.addListener(new CrawlerListener() {
/**
 * Callback invoked for each page visited by the site crawler.
 *
 * <p>The page URL is announced on standard error. Pages whose parse data is not
 * an {@link HtmlParseData} are ignored. For HTML pages, the HTML text is wrapped
 * together with the page URL in a {@link StringDocumentSource} and handed to the
 * enclosing instance's superclass {@code performExtraction}, while holding
 * {@code roverLock}.</p>
 *
 * <p>Any {@link Exception} raised during extraction is caught and reported on
 * standard error (message only) rather than propagated to the caller.</p>
 *
 * @param page the visited page.
 */
@Override
public void visitedPage(Page page) {
    final String url = page.getWebURL().getURL();
    System.err.println( format("Processing page: [%s]", url) );

    final ParseData data = page.getParseData();
    if (!(data instanceof HtmlParseData)) {
        // Only HTML content is extracted; anything else is skipped.
        return;
    }

    final HtmlParseData html = (HtmlParseData) data;
    try {
        // Extraction runs one page at a time under roverLock.
        // NOTE(review): presumably because callbacks can arrive from several
        // crawler threads at once - confirm against SiteCrawler.
        synchronized (roverLock) {
            final StringDocumentSource source = new StringDocumentSource(html.getHtml(), url);
            Crawler.super.performExtraction(source);
        }
    } catch (Exception e) {
        System.err.println(
            format("Error while processing page [%s], error: %s .", url, e.getMessage())
        );
    }
}
});
Runtime.getRuntime().addShutdownHook( new Thread() {
/**
 * Body of the thread registered as a JVM shutdown hook: prints the result of
 * the enclosing instance's superclass {@code printReports()} to standard error.
 * Any {@link Exception} raised while doing so is dumped to standard error and
 * not rethrown.
 */
@Override
public void run() {
try {
System.err.println( Crawler.super.printReports() );
// The crawler is deliberately not stopped here; the call below is kept
// disabled because of the problem noted in the TODO.
// siteCrawler.stop(); // TODO: causes the shutdown to hang.
} catch (Exception e) {
// Report the failure on stderr without rethrowing it from the hook.
e.printStackTrace(System.err);
}
}
});
siteCrawler.start(seed, pageFilter, true);
}