/**
 * Fetches the given URI over HTTP and records the response through the
 * URI's {@code Recorder}.
 *
 * <p>Steps, in order: note the fetch begin time; configure (or clear) the
 * content digest on the recorded input; build and execute the request;
 * force the response body to be read to its end so that stalls surface
 * here; close the recorder and record completion time, charset, sizes and
 * other codings; store the content digest if enabled; then handle
 * credential promotion and 401/407 responses.
 *
 * <p>Failures building the request, executing it, or reading the body are
 * handed to {@code cleanup}/{@code failedExecuteCleanup} and the method
 * returns early. A missing, empty or unparseable {@code content-length}
 * header is treated as "length unknown" (-1).
 *
 * @param curi the URI being fetched; fetch results are written into it
 * @throws InterruptedException declared by the signature; which callee
 *         raises it is not visible in this method
 */
protected void innerProcess(final CrawlURI curi) throws InterruptedException {
    // Note begin time
    curi.setFetchBeginTime(System.currentTimeMillis());

    // Get a reference to the recorder associated with this CrawlURI.
    final Recorder rec = curi.getRecorder();

    // Shall we get a digest on the content downloaded?
    boolean digestContent = getDigestContent();
    String algorithm = null;
    if (digestContent) {
        algorithm = getDigestAlgorithm();
        rec.getRecordedInput().setDigest(algorithm);
    } else {
        // clear any digest left configured on the recorded input
        rec.getRecordedInput().setDigest((MessageDigest)null);
    }

    FetchHTTPRequest req;
    try {
        req = new FetchHTTPRequest(this, curi);
    } catch (URIException e) {
        cleanup(curi, e, e.getMessage(), S_UNFETCHABLE_URI);
        return;
    }

    // Limits: max length, timeout (seconds scaled by 1000, presumably to
    // milliseconds), and max fetch rate.
    rec.getRecordedInput().setLimits(getMaxLengthBytes(),
            1000L * (long) getTimeoutSeconds(), (long) getMaxFetchKBSec());

    HttpResponse response = null;
    try {
        response = req.execute();
        addResponseContent(response, curi);
    } catch (ClientProtocolException e) {
        failedExecuteCleanup(curi, e);
        return;
    } catch (IOException e) {
        failedExecuteCleanup(curi, e);
        return;
    }

    maybeMidfetchAbort(curi, req.request);

    // The content-length header is supplied by the remote server and may be
    // absent, empty, padded with whitespace, or not a number at all. Parse
    // the trimmed text and fall back to -1 (the same value used when the
    // header is absent) instead of letting an unchecked exception escape
    // before the recorder is closed.
    long contentLength = -1L;
    Header h = response.getLastHeader("content-length");
    if (h != null && h.getValue() != null) {
        String lengthText = h.getValue().trim();
        if (lengthText.length() > 0) {
            try {
                contentLength = Long.parseLong(lengthText);
            } catch (NumberFormatException e) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("ignoring unparseable content-length '"
                            + lengthText + "' for " + curi);
                }
            }
        }
    }

    try {
        if (!req.request.isAborted()) {
            // Force read-to-end, so that any socket hangs occur here,
            // not in later modules.
            rec.getRecordedInput().readToEndOfContent(contentLength);
        }
    } catch (RecorderTimeoutException ex) {
        doAbort(curi, req.request, TIMER_TRUNC);
    } catch (RecorderLengthExceededException ex) {
        doAbort(curi, req.request, LENGTH_TRUNC);
    } catch (IOException e) {
        cleanup(curi, e, "readFully", S_CONNECT_LOST);
        return;
    } catch (ArrayIndexOutOfBoundsException e) {
        // For weird windows-only ArrayIndex exceptions from native code
        // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
        // treating as if it were an IOException
        cleanup(curi, e, "readFully", S_CONNECT_LOST);
        return;
    } finally {
        // Runs on every exit from the try above, including the early
        // returns in the catch clauses.
        rec.close();
        // ensure recording has stopped
        rec.closeRecorders();
        // Note completion time
        curi.setFetchCompletedTime(System.currentTimeMillis());
        // Set the response charset into the HttpRecord if available.
        setCharacterEncoding(curi, rec, response);
        setSizes(curi, rec);
        setOtherCodings(curi, rec, response);
    }

    if (digestContent) {
        curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
    }

    if (logger.isLoggable(Level.FINE)) {
        logger.fine(((curi.getFetchType() == HTTP_POST) ? "POST" : "GET")
                + " " + curi.getUURI().toString() + " "
                + response.getStatusLine().getStatusCode() + " "
                + rec.getRecordedInput().getSize() + " "
                + curi.getContentType());
    }

    if (isSuccess(curi) && req.addedCredentials) {
        // Promote the credentials from the CrawlURI to the CrawlServer
        // so they are available for all subsequent CrawlURIs on this
        // server.
        promoteCredentials(curi);
    } else if (response.getStatusLine().getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
        // 401 is not 'success'.
        handle401(response, curi);
    } else if (response.getStatusLine().getStatusCode() == HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED) {
        // 407 - remember Proxy-Authenticate headers for later use
        kp.put("proxyAuthChallenges",
                extractChallenges(response, curi, ProxyAuthenticationStrategy.INSTANCE));
    }

    // Second-chance close: the recorded input should already be closed by
    // this point; complain loudly and close it if it is not.
    if (rec.getRecordedInput().isOpen()) {
        logger.severe(curi.toString() + " RIS still open. Should have"
                + " been closed by method release: "
                + Thread.currentThread().getName());
        try {
            rec.getRecordedInput().close();
        } catch (IOException e) {
            logger.log(Level.SEVERE, "second-chance RIS close failed", e);
        }
    }
}