// Pipeline for a single URL: fetch it through the Protocol chosen by
// ProtocolFactory, parse the fetched Content, then run the resulting Parse
// through the configured IndexingFilters to build a NutchDocument.
// Exit codes visible in this section: 0 when the fetch is unsuccessful or
// yields no content, -1 when the content type is unknown.

// Guarded so the message string is only built when INFO logging is enabled.
if (LOG.isInfoEnabled()) {
LOG.info("fetching: " + url);
}
IndexingFilters indexers = new IndexingFilters(conf);
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
// A fresh CrawlDatum is used for the fetch; it later receives the content
// type as metadata and is handed to the indexing filters.
CrawlDatum datum = new CrawlDatum();
ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
// Fetch failures are reported on stdout (not through LOG) and end with 0.
if (!output.getStatus().isSuccess()) {
System.out.println("Fetch failed with protocol status: " + output.getStatus());
return 0;
}
Content content = output.getContent();
// A successful status without content is also reported on stdout and ends with 0.
if (content == null) {
System.out.println("No content for " + url);
return 0;
}
contentType = content.getContentType();
// NOTE(review): this is the only early exit here that returns -1, and it does
// so without printing or logging anything -- confirm that this is intended.
if (contentType == null) {
return -1;
}
// store the guessed content type in the crawldatum
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
// Truncated content only produces a warning; parsing is still attempted.
if (ParseSegment.isTruncated(content)) {
LOG.warn("Content is truncated, parse may fail!");
}
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
}
ParseResult parseResult = new ParseUtil(conf).parse(content);
// Start from an empty document; the filters are expected to populate it.
NutchDocument doc = new NutchDocument();
Text urlText = new Text(url);
// No inlink information is available here, so null is passed to the filters.
Inlinks inlinks = null;
// NOTE(review): the looked-up Parse is not checked before use -- presumably
// the result is keyed by the fetched URL; verify whether get() can return null.
Parse parse = parseResult.get(urlText);
try {
doc = indexers.filter(doc, parse, urlText, datum, inlinks);
} catch (IndexingException e) {
// The stack trace goes to stderr and execution continues. Because the
// assignment above did not complete, doc still refers to the empty
// NutchDocument created earlier rather than null.
e.printStackTrace();
}
if (doc == null) {