}
public static int fetch(Page page, boolean ignoreIfBinary) {
String toFetchURL = page.getWebURL().getURL();
HttpGet get = null;
HttpEntity entity = null;
try {
get = new HttpGet(toFetchURL);
synchronized (mutex) {
long now = (new Date()).getTime();
if (now - startOfPeriod > 10000) {
logger.info("Number of pages fetched per second: " + processedCount
/ ((now - startOfPeriod) / 1000));
processedCount = 0;
startOfPeriod = now;
}
processedCount++;
if (now - lastFetchTime < politenessDelay) {
Thread.sleep(politenessDelay - (now - lastFetchTime));
}
lastFetchTime = (new Date()).getTime();
}
HttpResponse response = httpclient.execute(get);
entity = response.getEntity();
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode != HttpStatus.SC_OK) {
if (statusCode != HttpStatus.SC_NOT_FOUND) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
Header header = response.getFirstHeader("Location");
if (header != null) {
String movedToUrl = header.getValue();
page.getWebURL().setURL(movedToUrl);
} else {
page.getWebURL().setURL(null);
}
return PageFetchStatus.Moved;
}
logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
} else if (show404Pages) {
logger.info("Not Found: " + toFetchURL + " (Link found in doc#: "
+ page.getWebURL().getParentDocid() + ")");
}
return response.getStatusLine().getStatusCode();
}
String uri = get.getURI().toString();
if (!uri.equals(toFetchURL)) {
if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
int newdocid = DocIDServer.getDocID(uri);
if (newdocid != -1) {
if (newdocid > 0) {
return PageFetchStatus.RedirectedPageIsSeen;
}
WebURL webURL = new WebURL();
webURL.setURL(uri);
webURL.setDocid(DocIDServer.getNewDocID(uri));
page.setWebURL(webURL);
}
}
}
if (entity != null) {
long size = entity.getContentLength();
if (size == -1) {
Header length = response.getLastHeader("Content-Length");
if (length == null) {
length = response.getLastHeader("Content-length");
}
if (length != null) {
size = Integer.parseInt(length.getValue());
} else {
size = -1;
}
}
if (size > MAX_DOWNLOAD_SIZE) {
entity.consumeContent();
return PageFetchStatus.PageTooBig;
}
boolean isBinary = false;
Header type = entity.getContentType();
if (type != null) {
String typeStr = type.getValue().toLowerCase();
if (typeStr.contains("image") || typeStr.contains("audio") || typeStr.contains("video")) {
isBinary = true;
if (ignoreIfBinary) {
return PageFetchStatus.PageIsBinary;
}
}
}
if (page.load(entity.getContent(), (int) size, isBinary)) {
return PageFetchStatus.OK;
} else {
return PageFetchStatus.PageLoadError;
}
} else {
get.abort();
}
} catch (IOException e) {
logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
+ " (link found in doc #" + page.getWebURL().getParentDocid() + ")");
return PageFetchStatus.FatalTransportError;
} catch (IllegalStateException e) {
// ignoring exceptions that occur because of not registering https
// and other schemes
} catch (Exception e) {
if (e.getMessage() == null) {
logger.error("Error while fetching " + page.getWebURL().getURL());
} else {
logger.error(e.getMessage() + " while fetching " + page.getWebURL().getURL());
}
} finally {
try {
if (entity != null) {
entity.consumeContent();
} else if (get != null) {
get.abort();
}
} catch (Exception e) {
e.printStackTrace();