private FetchedResult doRequest(HttpRequestBase request, String url, Payload payload) throws BaseFetchException {
LOGGER.trace("Fetching " + url);
HttpResponse response;
long readStartTime;
HttpHeaders headerMap = new HttpHeaders();
String redirectedUrl = null;
String newBaseUrl = null;
int numRedirects = 0;
boolean needAbort = true;
String contentType = "";
String mimeType = "";
String hostAddress = null;
// Create a local instance of cookie store, and bind to local context
// Without this we get killed w/lots of threads, due to sync() on single cookie store.
HttpContext localContext = new BasicHttpContext();
CookieStore cookieStore = new BasicCookieStore();
localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);
StringBuilder fetchTrace = null;
if (LOGGER.isTraceEnabled()) {
fetchTrace = new StringBuilder("Fetched url: " + url);
try {
request.setURI(new URI(url));
readStartTime = System.currentTimeMillis();
response = _httpClient.execute(request, localContext);
Header[] headers = response.getAllHeaders();
for (Header header : headers) {
headerMap.add(header.getName(), header.getValue());
int httpStatus = response.getStatusLine().getStatusCode();
if (LOGGER.isTraceEnabled()) {
fetchTrace.append("; status code: " + httpStatus);
if (headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH) != null) {
fetchTrace.append("; Content-Length: " + headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH));
if (headerMap.getFirst(HttpHeaderNames.LOCATION) != null) {
fetchTrace.append("; Location: " + headerMap.getFirst(HttpHeaderNames.LOCATION));
if ((httpStatus < 200) || (httpStatus >= 300)) {
// We can't just check against SC_OK, as some wackos return 201, 202, etc
throw new HttpFetchException(url, "Error fetching " + url, httpStatus, headerMap);
redirectedUrl = extractRedirectedUrl(url, localContext);
URI permRedirectUri = (URI)localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY);
if (permRedirectUri != null) {
newBaseUrl = permRedirectUri.toURL().toExternalForm();
Integer redirects = (Integer)localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
if (redirects != null) {
numRedirects = redirects.intValue();
hostAddress = (String)(localContext.getAttribute(HOST_ADDRESS));
if (hostAddress == null) {
throw new UrlFetchException(url, "Host address not saved in context");
Header cth = response.getFirstHeader(HttpHeaderNames.CONTENT_TYPE);
if (cth != null) {
contentType = cth.getValue();
// Check if we should abort due to mime-type filtering. Note that this will fail if the server
// doesn't report a mime-type, but that's how we want it as this configuration is typically
// used when only a subset of parsers are installed/enabled, so we don't want the auto-detect
// code in Tika to get triggered & try to process an unsupported type. If you want unknown
// mime-types from the server to be processed, set "" as one of the valid mime-types in FetcherPolicy.
mimeType = HttpUtils.getMimeTypeFromContentType(contentType);
Set<String> mimeTypes = _fetcherPolicy.getValidMimeTypes();
if ((mimeTypes != null) && (mimeTypes.size() > 0)) {
if (!mimeTypes.contains(mimeType)) {
throw new AbortedFetchException(url, "Invalid mime-type: " + mimeType, AbortedFetchReason.INVALID_MIMETYPE);
needAbort = false;
} catch (ClientProtocolException e) {
// Oleg guarantees that no abort is needed in the case of an IOException (which ClientProtocolException is a subclass of)
needAbort = false;
// If the root cause was a "too many redirects" error, we want to map this to a specific
// exception that contains the final redirect.
if (e.getCause() instanceof MyRedirectException) {
MyRedirectException mre = (MyRedirectException)e.getCause();
String redirectUrl = url;
try {
redirectUrl = mre.getUri().toURL().toExternalForm();
} catch (MalformedURLException e2) {
LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri());
throw new RedirectFetchException(url, redirectUrl, mre.getReason());
} else if (e.getCause() instanceof RedirectException) {
throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS);
} else {
throw new IOFetchException(url, e);
} catch (IOException e) {
// Oleg guarantees that no abort is needed in the case of an IOException
needAbort = false;
if (e instanceof ConnectionPoolTimeoutException) {
// Should never happen, so let's dump some info about the connection pool.
ThreadSafeClientConnManager cm = (ThreadSafeClientConnManager)_httpClient.getConnectionManager();
int numConnections = cm.getConnectionsInPool();
cm.closeIdleConnections(0, TimeUnit.MILLISECONDS);
LOGGER.error(String.format("Got ConnectionPoolTimeoutException: %d connections before, %d after idle close", numConnections, cm.getConnectionsInPool()));
throw new IOFetchException(url, e);
} catch (URISyntaxException e) {
throw new UrlFetchException(url, e.getMessage());
} catch (IllegalStateException e) {
throw new UrlFetchException(url, e.getMessage());
} catch (BaseFetchException e) {
throw e;
} catch (Exception e) {
// Map anything else to a generic IOFetchException
// TODO KKr - create generic fetch exception
throw new IOFetchException(url, new IOException(e));
} finally {
safeAbort(needAbort, request);
// Figure out how much data we want to try to fetch.
int maxContentSize = getMaxContentSize(mimeType);
int targetLength = maxContentSize;
boolean truncated = false;
String contentLengthStr = headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH);
if (contentLengthStr != null) {
try {
int contentLength = Integer.parseInt(contentLengthStr);
if (contentLength > targetLength) {
truncated = true;
} else {
targetLength = contentLength;
} catch (NumberFormatException e) {
// Ignore (and log) invalid content length values.
LOGGER.warn("Invalid content length in header: " + contentLengthStr);
// Now finally read in response body, up to targetLength bytes.
// Note that entity might be null, for zero length responses.
byte[] content = new byte[0];
long readRate = 0;
HttpEntity entity = response.getEntity();
needAbort = true;
if (entity != null) {
InputStream in = null;
try {
in = entity.getContent();
byte[] buffer = new byte[BUFFER_SIZE];
int bytesRead = 0;
int totalRead = 0;
ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE);
int readRequests = 0;
int minResponseRate = _fetcherPolicy.getMinResponseRate();
// TODO KKr - we need to monitor the rate while reading a single block.
// Look at HttpClient metrics support for how to do this. Once we fix this,
// fix the test to read a smaller (< 20K) chunk of data.
while ((totalRead < targetLength)
&& ((bytesRead =, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) {
readRequests += 1;
totalRead += bytesRead;
out.write(buffer, 0, bytesRead);
// Assume read time is at least one millisecond, to avoid a divide-by-zero exception.
long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime);
readRate = (totalRead * 1000L) / totalReadTime;
// Don't bail on the first read cycle, as we can get a hiccup starting out.
// Also don't bail if we've read everything we need.
if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) {
throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE);
// Check to see if we got interrupted.
if (Thread.interrupted()) {
throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED);
content = out.toByteArray();
needAbort = truncated || (in.available() > 0);
} catch (IOException e) {
// We don't need to abort if there's an IOException
throw new IOFetchException(url, e);
} finally {
safeAbort(needAbort, request);
// Toss truncated non-text content (e.g. images), since a partial binary payload is presumably unusable.
if ( (truncated)
&& (!isTextMimeType(mimeType))) {
throw new AbortedFetchException(url, "Truncated image", AbortedFetchReason.CONTENT_SIZE);
// Now see if we need to uncompress the content.
String contentEncoding = headerMap.getFirst(HttpHeaderNames.CONTENT_ENCODING);
if (contentEncoding != null) {
if (LOGGER.isTraceEnabled()) {
fetchTrace.append("; Content-Encoding: " + contentEncoding);