/**
 * Filters run over the freshly built {@code Parse} in {@code getParse}, via
 * {@code filter(url, page, parse, metaTags, root)}; the value they return is
 * what {@code getParse} hands back. Initialized outside the visible code.
 */
private ParseFilters htmlParseFilters;
/**
 * Value written (as bytes) into the page metadata under
 * {@code Nutch.CACHING_FORBIDDEN_KEY} when the page's meta tags report
 * no-cache. Initialized outside the visible code.
 */
private String cachingPolicy;
public Parse getParse(String url, WebPage page) {
HTMLMetaTags metaTags = new HTMLMetaTags();
String baseUrl = TableUtil.toString(page.getBaseUrl());
URL base;
try {
base = new URL(baseUrl);
} catch (MalformedURLException e) {
return ParseStatusUtils.getEmptyParse(e, getConf());
}
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;
try {
ByteBuffer contentInOctets = page.getContent();
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(),
contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining()));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(page, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(page, defaultCharEncoding);
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
root = parse(input);
} catch (IOException e) {
LOG.error("Failed with the following IOException: ", e);
return ParseStatusUtils.getEmptyParse(e, getConf());
} catch (DOMException e) {
LOG.error("Failed with the following DOMException: ", e);
return ParseStatusUtils.getEmptyParse(e, getConf());
} catch (SAXException e) {
LOG.error("Failed with the following SAXException: ", e);
return ParseStatusUtils.getEmptyParse(e, getConf());
} catch (Exception e) {
LOG.error("Failed with the following Exception: ", e);
return ParseStatusUtils.getEmptyParse(e, getConf());
}
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) { // okay to index
StringBuilder sb = new StringBuilder();
if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
utils.getText(sb, root); // extract text
text = sb.toString();
sb.setLength(0);
if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
utils.getTitle(sb, root); // extract title
title = sb.toString().trim();
}
if (!metaTags.getNoFollow()) { // okay to follow links
ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
URL baseTag = utils.getBase(root);
if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); }
utils.getOutlinks(baseTag!=null?baseTag:base, l, root);
outlinks = l.toArray(new Outlink[l.size()]);
if (LOG.isTraceEnabled()) {
LOG.trace("found "+outlinks.length+" outlinks in "+ url);
}
}
ParseStatus status = new ParseStatus();
status.setMajorCode(ParseStatusCodes.SUCCESS);
if (metaTags.getRefresh()) {
status.setMinorCode(ParseStatusCodes.SUCCESS_REDIRECT);
status.addToArgs(new Utf8(metaTags.getRefreshHref().toString()));
status.addToArgs(new Utf8(Integer.toString(metaTags.getRefreshTime())));
}
Parse parse = new Parse(text, title, outlinks, status);
parse = htmlParseFilters.filter(url, page, parse, metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
page.putToMetadata(new Utf8(Nutch.CACHING_FORBIDDEN_KEY),
ByteBuffer.wrap(Bytes.toBytes(cachingPolicy)));
}
return parse;