output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM);
Text redirUrl = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
redirectCount++;
fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
fiq.addInProgressFetchItem(fit);
if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect to " + redirUrl + " (fetching now)");
}
} else {
output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect to " + redirUrl + " (fetching later)");
}
}
} else if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect skipped: " +
(newUrl != null ? "to same url" : "filtered"));
}
}
break;
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
int code;
if (status.getCode() == ProtocolStatus.MOVED) {
code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
} else {
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
}
output(fit.url, fit.datum, content, status, code);
String newUrl = status.getMessage();
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(fit.url.toString())) {
Text redirUrl = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
redirectCount++;
fit = FetchItem.create(redirUrl, new CrawlDatum(), byIP);
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
fiq.addInProgressFetchItem(fit);
if (LOG.isDebugEnabled()) {
LOG.debug(" - protocol redirect to " + redirUrl + " (fetching now)");
}
} else {
output(redirUrl, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - protocol redirect to " + redirUrl + " (fetching later)");
}
}
} else if (LOG.isDebugEnabled()) {