@Override
public void run() {
// Count this domain as entering processing before any work is attempted.
_flowProcess.increment(FetchCounters.DOMAINS_PROCESSING, 1);
try {
// Resolve the protocol+domain string; an unresolvable host is surfaced as
// UnknownHostException (presumably handled by a catch clause below this
// visible chunk — TODO confirm).
DomainInfo domainInfo = new DomainInfo(_protocolAndDomain);
if (!domainInfo.isValidHostAddress()) {
throw new UnknownHostException(_protocolAndDomain);
}
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(String.format("Resolved %s to %s", _protocolAndDomain, domainInfo.getHostAddress()));
}
String domain = domainInfo.getDomain();
// PLD = paid-level domain (e.g. "example.com" for "www.example.com"),
// as extracted by the DomainNames helper.
String pld = DomainNames.getPLD(domain);
if (!_scorer.isGoodDomain(domain, pld)) {
// Domain rejected by the scorer: count the domain and all of its queued
// URLs as skipped, then drain the queue under the SKIPPED grouping key.
_flowProcess.increment(FetchCounters.DOMAINS_SKIPPED, 1);
_flowProcess.increment(FetchCounters.URLS_SKIPPED, _urls.size());
LOGGER.debug("Skipping URLs from not-good domain: " + domain);
emptyQueue(_urls, GroupingKey.SKIPPED_GROUPING_KEY, _collector, _flowProcess);
} else {
// Fetch and parse this domain's robots.txt to get its crawl rules.
BaseRobotRules robotRules = RobotUtils.getRobotRules(_fetcher, _parser, new URL(domainInfo.getProtocolAndDomain() + "/robots.txt"));
// Grouping key shared by every URL of this domain; stays null when
// visits are deferred (robots.txt fetch was inconclusive per
// BaseRobotRules.isDeferVisits()).
String validKey = null;
boolean isDeferred = robotRules.isDeferVisits();
if (isDeferred) {
LOGGER.debug("Deferring visits to URLs from " + domainInfo.getDomain());
_flowProcess.increment(FetchCounters.DOMAINS_DEFERRED, 1);
} else {
// Key combines resolved host address and the robots.txt crawl delay.
validKey = GroupingKey.makeGroupingKey(domainInfo.getHostAddress(), robotRules.getCrawlDelay());
_flowProcess.increment(FetchCounters.DOMAINS_FINISHED, 1);
}
// Use the same key for every URL from this domain
GroupedUrlDatum datum;