// Cache miss: fetch http://<host>/robots.txt, follow at most ONE redirect hop,
// and map the HTTP status code to a rule set. NOTE(review): this span is a
// fragment — the enclosing method and the catch body below are not visible here.
if (robotRules == null) { // cache miss
URL redir = null;
if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
try {
// Fetch robots.txt from the site root, resolved against the page URL.
Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
new WebPage(), true);
// Try one level of redirection only. NOTE(review): 303/307/308 redirects
// are NOT followed here, and a redirect-to-another-redirect falls through
// to the status-code handling below (typically yielding EMPTY_RULES).
if (response.getCode() == 301 || response.getCode() == 302) {
String redirection = response.getHeader("Location");
if (redirection == null) {
// some versions of MS IIS are known to mangle this header
// (lower-case "location"), so retry with that spelling.
// NOTE(review): presumably getHeader() is case-sensitive — confirm.
redirection = response.getHeader("location");
}
if (redirection != null) {
if (!redirection.startsWith("http")) {
// RFC says Location should be an absolute URI, but in practice
// servers send relative paths; resolve against the original URL.
redir = new URL(url, redirection);
} else {
redir = new URL(redirection);
}
// Second (and final) fetch at the redirect target.
response = ((HttpBase)http).getResponse(redir, new WebPage(), true);
}
}
// Map final status code to a rule set:
//   200          -> parse the returned robots.txt body
//   403          -> treat whole site as forbidden (unless allowForbidden)
//   5xx          -> transient server error: use empty rules, do NOT cache
//   anything else (404 etc.) -> no robots.txt: everything allowed
if (response.getCode() == 200) // found rules: parse them
robotRules = parseRules(url.toString(), response.getContent(),
response.getHeader("Content-Type"),
agentNames);
else if ( (response.getCode() == 403) && (!allowForbidden) )
robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
cacheRule = false; // don't cache a result derived from a server error
robotRules = EMPTY_RULES;
}else
robotRules = EMPTY_RULES; // use default rules
// NOTE(review): catching Throwable is very broad (also traps Errors);
// the handler body is outside this fragment, so its behavior is unverified here.
} catch (Throwable t) {