private void processPage(WebURL curURL) {
    if (curURL == null) {
        return;
    }
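    // Fetch the response status and headers first; the body is downloaded further
    // below, only after the page has passed the redirect and status checks.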
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchHeader(curURL);
        int statusCode = fetchResult.getStatusCode();
        handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
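        // Non-OK responses: optionally re-schedule redirect targets, otherwise give up on this page.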
        if (statusCode != HttpStatus.SC_OK) {
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                if (myController.getConfig().isFollowRedirects()) {
                    String movedToUrl = fetchResult.getMovedToUrl();
                    if (movedToUrl == null) {
                        return;
                    }
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        // Redirect page is already seen
                        return;
                    } else {
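                        // Unseen redirect target: queue it as a new URL that inherits
                        // the parent and depth of the URL that redirected to it.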
                        WebURL webURL = new WebURL();
                        webURL.setURL(movedToUrl);
                        webURL.setParentDocid(curURL.getParentDocid());
                        webURL.setParentUrl(curURL.getParentUrl());
                        webURL.setDepth(curURL.getDepth());
                        webURL.setDocid(-1);
                        // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                        // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                        if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        }
                    }
                }
            } else if (statusCode == CustomFetchStatus.PageTooBig) {
                logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
            }
            return;
        }
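        // The fetcher may already have followed redirects, so the URL that was actually
        // fetched can differ from the requested one; switch to it (and a new doc id) here.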
        if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
            if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                // Redirect page is already seen
                return;
            }
            curURL.setURL(fetchResult.getFetchedUrl());
            curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
        }
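        // Download the content and parse it; only successfully parsed pages are visited,
        // and only HTML pages contribute outgoing links to the frontier.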
        Page page = new Page(curURL);
        int docid = curURL.getDocid();
        if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                List<WebURL> toSchedule = new ArrayList<WebURL>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
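                // Assign parent and depth to each outgoing link and collect the previously
                // unseen ones that pass the depth, shouldVisit and robots checks.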
                for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                    webURL.setParentDocid(docid);
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This URL has been seen before, so mark it with a negative depth.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
                            // TODO This only proceeds if the tracked Crawl-delay comes back as 0 (instantly ok to fetch)
                            // Need to augment scheduling logic to delay fetches into a time-based priority queue, or some such
                            if (shouldVisit(webURL) && robotstxtServer.allowedIn(webURL) == 0L) {
                                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                toSchedule.add(webURL);
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            }
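            // Hand the fetched and parsed page to the user-defined visit() callback.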
            visit(page);
        }
    } catch (Exception e) {
        logger.error("Exception while processing: " + curURL.getURL(), e);
    } finally {
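        // Always clean up the fetch result so any unconsumed response content is discarded.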
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}