// Run the fetched content through Tika once, collecting both the outlinks
// and the body text in the same pass.
ByteArrayInputStream bais = new ByteArrayInputStream(content);
Metadata md = new Metadata();
String text = null;
LinkContentHandler linkHandler = new LinkContentHandler();
ContentHandler textHandler = new BodyContentHandler();
// The tee handler is given both handlers so a single parse feeds each of them.
TeeContentHandler teeHandler = new TeeContentHandler(linkHandler,
        textHandler);
ParseContext parseContext = new ParseContext();
// parse
try {
    tika.getParser().parse(bais, teeHandler, md, parseContext);
    text = textHandler.toString();
} catch (Exception e) {
    // Pass the exception itself (not e.getMessage()) so the logger records
    // the stack trace; the previous call handed the message string to a
    // format with no placeholder, so it never reached the log.
    LOG.error("Exception while parsing " + url, e);
    // One meter per exception type, plus the generic failure meter.
    eventMeters.scope(
            "error_content_parsing_" + e.getClass().getSimpleName())
            .mark();
    collector.fail(tuple);
    eventMeters.scope("tuple_fail").mark();
    return;
} finally {
    // Always release the stream, whether or not parsing succeeded.
    try {
        bais.close();
    } catch (IOException e) {
        LOG.error("Exception while closing stream", e);
    }
}
// Report how long the parsing step took, measured from `start`.
long duration = System.currentTimeMillis() - start;
LOG.info("Parsed " + url + " in " + duration + " msec");
// get the outlinks and convert them to strings (for now)
// The source URL is parsed first so its lower-cased host is available
// alongside the outlinks gathered below.
String fromHost;
URL url_;
try {
    url_ = new URL(url);
    fromHost = url_.getHost().toLowerCase();
} catch (MalformedURLException e1) {
    // we would have known by now as previous
    // components check whether the URL is valid
    // Keep the exception in the log so the cause is not lost if it does
    // happen anyway.
    LOG.error("MalformedURLException on " + url, e1);
    eventMeters.scope(
            "error_outlinks_parsing_" + e1.getClass().getSimpleName())
            .mark();
    collector.fail(tuple);
    eventMeters.scope("tuple_fail").mark();
    return;
}
// Links gathered by the link handler during the parse; the set is presized
// to the number of raw links.
List<Link> links = linkHandler.getLinks();
Set<String> slinks = new HashSet<String>(links.size());
for (Link l : links) {
if (StringUtils.isBlank(l.getUri()))
continue;
String urlOL = null;