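* @param key row key of the page; it is converted back to the page URL with
*          {@code TableUtil.unreverseUrl(key)}
* @param page the page to parse; it is skipped unless its status is
*          {@code CrawlStatus.STATUS_FETCHED}
* @return newly-discovered webpage (via a meta-redirect); {@code null} when the
*         page is skipped, no parser is found, parsing fails, or the parse is
*         {@code null}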
*/
public URLWebPage process(String key, WebPage page) {
URLWebPage redirectedPage = null;
String url = TableUtil.unreverseUrl(key);
byte status = (byte) page.getStatus();
if (status != CrawlStatus.STATUS_FETCHED) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status));
}
return redirectedPage;
}
Parse parse;
try {
parse = parse(url, page);
} catch (ParserNotFound e) {
// do not print stacktrace for the fact that some types are not mapped.
LOG.warn("No suitable parser found: " + e.getMessage());
return redirectedPage;
} catch (final Exception e) {
LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e));
return redirectedPage;
}
if (parse == null) {
return redirectedPage;
}
final byte[] signature = sig.calculate(page);
org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
page.setParseStatus(pstatus);
if (ParseStatusUtils.isSuccess(pstatus)) {
if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
String newUrl = ParseStatusUtils.getMessage(pstatus);
int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
try {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = filters.filter(newUrl);
} catch (URLFilterException e) {
return redirectedPage; // TODO: is this correct
} catch (MalformedURLException e) {
return redirectedPage;
}
if (newUrl == null || newUrl.equals(url)) {
String reprUrl = URLUtil.chooseRepr(url, newUrl,
refreshTime < FetcherJob.PERM_REFRESH_TIME);
WebPage newWebPage = new WebPage();
if (reprUrl == null) {
LOG.warn("reprUrl==null for " + url);
return redirectedPage;
} else {
page.setReprUrl(new Utf8(reprUrl));
}
page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
redirectedPage = new URLWebPage(reprUrl, newWebPage);
}
} else {
page.setText(new Utf8(parse.getText()));
page.setTitle(new Utf8(parse.getTitle()));
ByteBuffer prevSig = page.getSignature();