*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}
try {
// Always emit the fetch status/datum for this key.
output.collect(key, new NutchWritable(datum));
// Raw page content is emitted only when content storing is enabled.
if (content != null && storingContent)
output.collect(key, new NutchWritable(content));
// For parsing fetchers, post-process every parsed (sub-)document.
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
ParseData parseData = parse.getData();
// Replace a failed parse with an empty parse so downstream jobs
// still receive a well-formed record for this URL.
if (!parseStatus.isSuccess()) {
LOG.warn("Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(getConf());
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(content, parse);
// Ensure segment name and score are in parseData metadata
parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
segmentName);
parseData.getContentMeta().set(Nutch.SIGNATURE_KEY,
StringUtil.toHexString(signature));
// Pass fetch time to content meta
parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
Long.toString(datum.getFetchTime()));
// Only the record whose parse URL matches the fetch key carries
// the signature on its CrawlDatum (sub-documents do not).
if (url.equals(key))
datum.setSignature(signature);
// Scoring-filter failures are logged and tolerated — a broken
// scoring plugin must not abort the fetch output.
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
String fromHost;
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
// Cap the number of stored outlinks at the configured maximum.
int outlinksToStore = Math.min(maxOutlinks, links.length);
// fromHost is only needed when external links are ignored; a
// malformed source URL leaves it null, which disables the host
// comparison inside filterNormalize.
if (ignoreExternalLinks) {
try {
fromHost = new URL(url.toString()).getHost().toLowerCase();
} catch (MalformedURLException e) {
fromHost = null;
}
} else {
fromHost = null;
}
int validCount = 0;
// Process all outlinks, normalize, filter and deduplicate
// outlinkList preserves order for ParseData; outlinks (a set)
// deduplicates the URLs used for same-cycle following below.
List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, fromHost, ignoreExternalLinks, urlFilters, normalizers);
// Filters/normalizers signal rejection by returning null.
if (toUrl == null) {
continue;
}
validCount++;
links[i].setUrl(toUrl);
outlinkList.add(links[i]);
outlinks.add(toUrl);
}
// Only process depth N outlinks
if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
// Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
// NOTE(review): if these operands are ints, the division truncates
// before the multiply; also maxOutlinksByDepth is never read — the
// loop below caps on maxOutlinkDepthNumLinks instead. Possibly the
// loop condition was meant to use maxOutlinksByDepth; confirm
// against the intended depth-scaling behavior.
int maxOutlinksByDepth = (int)Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while(iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
followUrl = iter.next();
// Check whether we'll follow external outlinks
if (outlinksIgnoreExternal) {
// NOTE(review): URLUtil.getHost may return null for a bad URL,
// which would NPE here — verify upstream guarantees a valid host.
if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
continue;
}
}
reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);
// Create new FetchItem with depth incremented
FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
fetchQueues.addFetchItem(fit);
outlinkCounter++;
}
}
// Overwrite the outlinks in ParseData with the normalized and filtered set
parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
// Emit the parse result under the parse URL (which may differ from
// the fetch key, e.g. for sub-documents), not the original key.
output.collect(url, new NutchWritable(
new ParseImpl(new ParseText(parse.getText()),
parseData, parse.isCanonical())));
}
}
} catch (IOException e) {