// Set up the UserAgent for the fetcher.
UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);
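// The agent name, email address, and web address are what the crawler reports to the
// sites it fetches from, so their operators can tell who is crawling and how to reach us.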
// You also get to customize the FetcherPolicy
FetcherPolicy defaultPolicy = new FetcherPolicy();
defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);
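// The crawl delay is the minimum pause between successive requests to the same server,
// and the max content size caps how many bytes get downloaded per page. EFFICIENT mode
// trades completeness for throughput; see Bixo's FetcherMode for the alternatives.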
// It is a good idea to set a crawl duration when running long crawls, as you may
// end up in situations where the fetch slows down due to a 'long tail'; by
// specifying a crawl duration you know exactly when the crawl will end.
int crawlDurationInMinutes = options.getCrawlDuration();
boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
long targetEndTime = hasEndTime ?
        System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) :
        FetcherPolicy.NO_CRAWL_END_TIME;
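// For example, a 90-minute duration starting at time t yields a target end time of
// t + 90 * MILLISECONDS_PER_MINUTE; NO_CRAWL_END_TIME means the crawl runs until done.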
// By setting up a URL filter we only deal with the URLs that we want,
// instead of all of the URLs that we extract.
BaseUrlFilter urlFilter = null;
List<String> patterns = null;
String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
if (regexUrlFiltersFile != null) {
    patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
} else {
    patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
    if (domain != null) {
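        // RegexUrlFilter patterns use Nutch-style prefixes: a leading '+' includes
        // matching URLs, '-' excludes them. This pattern accepts http/https URLs on the
        // target domain or any subdomain, e.g. "http://www.example.com/page" when the
        // domain is "example.com".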
        String domainPatternStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
        patterns.add(domainPatternStr);
    } else {
        String protocolPatternStr = "+(?i)^(http|https)://*";
        patterns.add(protocolPatternStr);
        LOGGER.warn("Defaulting to basic URL regex filtering (just suffix and protocol)");
    }
}
urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));
// OK, now we're ready to start looping, since we've got our current settings.
for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {
    // Adjust target end time, if appropriate.
    if (hasEndTime) {
        int remainingLoops = (endLoop - curLoop) + 1;
        long now = System.currentTimeMillis();
        long perLoopTime = (targetEndTime - now) / remainingLoops;
        defaultPolicy.setCrawlEndTime(now + perLoopTime);
    }
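    // Splitting the remaining time evenly keeps the overall deadline: e.g. with 50
    // minutes left and 10 loops remaining (this one included), this loop's fetch is
    // given 5 minutes before it is cut off.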
    BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, outputPath, curLoop);
    String curLoopDirName = curLoopDirPath.getName();
    setLoopLoggerFile(logsDir + curLoopDirName, curLoop);
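    // Each iteration gets its own loop directory under outputPath, and log output is
    // redirected to a per-loop file named after that directory.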