OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
throws IOException {
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
final Writable value = values.next().get(); // unwrap
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
final CrawlDatum datum = (CrawlDatum)value;
if (CrawlDatum.hasDbStatus(datum)) {
dbDatum = datum;
}
else if (CrawlDatum.hasFetchStatus(datum)) {
// don't index unmodified (empty) pages
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
fetchDatum = datum;
}
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
continue;
} else {
throw new RuntimeException("Unexpected status: "+datum.getStatus());
}
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
// Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
if (deleteRobotsNoIndex) {
// Get the robots meta data
String robotsMeta = parseData.getMeta("robots");
// Does the robots meta contain a noindex directive for this URL?
if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
// Delete it!
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: "+value.getClass());
}
}
// If deletion is enabled, delete documents that are GONE or are redirects
if (delete && fetchDatum != null && dbDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
}
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {
return; // only have inlinks
}
// Whether to delete pages marked as duplicates
if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
reporter.incrCounter("IndexerStatus", "Duplicates deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
return;
}
// Whether to skip DB_NOTMODIFIED pages
if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
reporter.incrCounter("IndexerStatus", "Skipped", 1);
return;
}
if (!parseData.getStatus().isSuccess() ||
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
return;
}
NutchDocument doc = new NutchDocument();
doc.add("id", key.toString());
final Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
// add digest, used by dedup