* numberOfCrawlers is the number of concurrent threads that should
* be started for crawling.
*/
int numberOfCrawlers = Integer.parseInt(args[1]);
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
/*
* Be polite: Make sure that we don't send more than 1 request per
* second (1000 milliseconds between requests).
*/
config.setPolitenessDelay(1000);
/*
* You can set the maximum crawl depth here. The default value is -1 for
* unlimited depth
*/
config.setMaxDepthOfCrawling(2);
/*
* You can set the maximum number of pages to crawl. The default value
* is -1 for unlimited number of pages
*/
config.setMaxPagesToFetch(1000);
/*
* Do you need to set a proxy? If so, you can use:
* config.setProxyHost("proxyserver.example.com");
* config.setProxyPort(8080);
*
* If your proxy also needs authentication:
* config.setProxyUsername(username); config.setProxyPassword(password);
*/
/*
* This config parameter can be used to make your crawl resumable
* (meaning that you can resume the crawl from a previously
* interrupted/crashed crawl). Note: if you enable the resuming feature
* and want to start a fresh crawl, you need to delete the contents of
* the crawl storage folder (crawlStorageFolder) manually.
*/
config.setResumableCrawling(false);
/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);