long delay = serverDelay;
if (checkRobots) {
try {
if (!robots.isAllowed(this, u)) {
return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
}
} catch (Throwable e) {
// XXX Maybe bogus: assume this is allowed.
if (logger.isTraceEnabled()) {
logger.trace("Exception checking robot rules for " + url + ": " + e);
}
}
long crawlDelay = robots.getCrawlDelay(this, u);
delay = crawlDelay > 0 ? crawlDelay : serverDelay;
}
if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
// skip this page, otherwise the thread would block for too long.
LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
+ (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK);
}
String host = null;
if (checkBlocking) {
try {
host = blockAddr(u, delay);
} catch (BlockedException be) {
return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
}
}
Response response;
try {
response = getResponse(u, datum, false); // make a request
} finally {
if (checkBlocking) unblockAddr(host, delay);
}
int code = response.getCode();
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
(content == null ? EMPTY_CONTENT : content),
response.getHeader("Content-Type"),
response.getHeaders(), this.conf);
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
} else if (code == 410) { // page is gone
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
} else if (code >= 300 && code < 400) { // handle redirect
String location = response.getHeader("Location");
// some broken servers, such as MS IIS, use lowercase header name...
if (location == null) location = response.getHeader("location");
if (location == null) location = "";
u = new URL(u, location);
int protocolStatusCode;
switch (code) {
case 300: // multiple choices, preferred value in Location
protocolStatusCode = ProtocolStatus.MOVED;
break;
case 301: // moved permanently
case 305: // use proxy (Location is URL of proxy)
protocolStatusCode = ProtocolStatus.MOVED;
break;
case 302: // found (temporarily moved)
case 303: // see other (redirect after POST)
case 307: // temporary redirect
protocolStatusCode = ProtocolStatus.TEMP_MOVED;
break;
case 304: // not modified
protocolStatusCode = ProtocolStatus.NOTMODIFIED;
break;
default:
protocolStatusCode = ProtocolStatus.MOVED;
}
// handle this in the higher layer.
return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
} else if (code == 401) { // requires authorization, but no valid auth provided.
if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); }
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+ urlString));
} else if (code == 404) {
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
} else {
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
+ u));
}
} catch (Throwable e) {
e.printStackTrace(LogUtil.getErrorStream(logger));
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}