/**
 * Gathers every record emitted for one key and, when the set is complete,
 * emits a single {@code Document} wrapped in an {@code ObjectWritable}.
 *
 * <p>Each incoming value is an {@code ObjectWritable}; its payload is sorted
 * by type into inlinks, db/fetch/redirect {@code CrawlDatum}, parse data,
 * parse text, or a pagerank float. Nothing is emitted when:
 * <ul>
 *   <li>a {@code CrawlDatum} with {@code STATUS_LINKED} was seen (treated as a
 *       redirect and discarded);</li>
 *   <li>the fetch datum, parse text or parse data is missing, or - outside
 *       TREC collection mode - the db datum is missing;</li>
 *   <li>the content metadata lacks the segment name or the signature;</li>
 *   <li>an indexing filter or a scoring filter throws.</li>
 * </ul>
 *
 * <p>The emitted document carries the stored, non-indexed fields
 * {@code segment}, {@code digest}, {@code boost}, {@code inlinks},
 * {@code outlinks} and {@code pagerank}, in addition to whatever the indexing
 * filters add.
 *
 * @param key      record key; cast to {@code Text} before being handed to the
 *                 indexing and scoring filters
 * @param values   iterator over {@code ObjectWritable} instances for this key
 * @param output   collector receiving {@code (key, ObjectWritable(doc))}
 * @param reporter not used by this method
 * @throws IOException      declared by the signature; presumably propagated
 *                          from {@code output.collect} - nothing here throws it
 *                          directly
 * @throws RuntimeException if a {@code CrawlDatum} has a status that is neither
 *                          a db status, a fetch status nor {@code STATUS_LINKED}
 */
public void reduce(WritableComparable key, Iterator values,
                   OutputCollector output, Reporter reporter)
    throws IOException {
  Inlinks inlinks = null;
  CrawlDatum dbDatum = null;
  CrawlDatum fetchDatum = null;
  CrawlDatum redir = null;
  ParseData parseData = null;
  ParseText parseText = null;
  Float pagerank = null; // TODO MC

  while (values.hasNext()) {
    Object value = ((ObjectWritable) values.next()).get(); // unwrap

    if (value == null) {
      // instanceof is false for null, so a null payload would otherwise reach
      // value.getClass() in the fallback branch and throw a NullPointerException.
      if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: null");
      }
      continue;
    }

    if (value instanceof Inlinks) {
      inlinks = (Inlinks) value;
    } else if (value instanceof CrawlDatum) {
      CrawlDatum datum = (CrawlDatum) value;
      if (CrawlDatum.hasDbStatus(datum)) {
        dbDatum = datum;
      } else if (CrawlDatum.hasFetchStatus(datum)) {
        fetchDatum = datum;
      } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) {
        // redirected page
        redir = datum;
      } else {
        throw new RuntimeException("Unexpected status: " + datum.getStatus());
      }
    } else if (value instanceof ParseData) {
      parseData = (ParseData) value;
    } else if (value instanceof ParseText) {
      parseText = (ParseText) value;
    } else if (value instanceof FloatWritable) { // TODO MC
      pagerank = ((FloatWritable) value).get();
    } else if (LOG.isWarnEnabled()) {
      LOG.warn("Unrecognized type: " + value.getClass());
    }
  }

  // Evaluated once; the collection mode is consulted several times below.
  final boolean trec = collectionType.equals(Global.COLLECTION_TYPE_TREC);

  if (trec && LOG.isInfoEnabled()) {
    // Each flag is true when the corresponding input is MISSING.
    LOG.info("index TREC: "+key.toString()+" "+(redir==null)+" "+(fetchDatum == null)+" "+(dbDatum == null)+" "+(parseText == null)+" "+(parseData == null)+" "+(inlinks==null)+" "+(pagerank==null));
  }

  if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
    // XXX page was redirected - what should we do?
    // XXX discard it for now
    if (LOG.isInfoEnabled()) {
      LOG.info("index REDIR:"+redir); // sanity check
    }
    return;
  }

  // Fetch datum, parse text and parse data are always required. The db datum
  // is required too, except in TREC mode where its absence is tolerated.
  boolean incomplete = fetchDatum == null || parseText == null || parseData == null
      || (!trec && dbDatum == null);
  if (incomplete) {
    return; // only have inlinks
  }

  Metadata metadata = parseData.getContentMeta();
  String segmentName = metadata.get(Nutch.SEGMENT_NAME_KEY);
  String signature = metadata.get(Nutch.SIGNATURE_KEY);
  if (segmentName == null || signature == null) {
    LOG.error("Metadata empty:"+key+" "+parseData.toString());
    return;
  }

  Document doc = new Document();
  // add segment, used to map from merged index back to segment files
  doc.add(new Field("segment", segmentName, Field.Store.YES, Field.Index.NO));
  // add digest, used by dedup
  doc.add(new Field("digest", signature, Field.Store.YES, Field.Index.NO));

  Parse parse = new ParseImpl(parseText, parseData);
  Text url = (Text) key;

  try {
    // run indexing filters
    doc = this.filters.filter(doc, parse, url, fetchDatum, inlinks);
  } catch (IndexingException e) {
    if (LOG.isWarnEnabled()) {
      // Pass the exception as the throwable so the stack trace is not lost.
      LOG.warn("Error indexing "+key+": "+e, e);
    }
    return;
  }

  float boost = 1.0f;
  // run scoring filters
  // Outside TREC mode dbDatum is guaranteed non-null by the completeness check
  // above, so scoring is skipped only for TREC records without a db datum.
  if (dbDatum != null) {
    try {
      boost = this.scfilters.indexerScore(url, doc, dbDatum,
          fetchDatum, parse, inlinks, boost);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Error calculating score " + key + ": " + e, e);
      }
      return;
    }
  }

  // apply boost to all indexed fields.
  // doc.setBoost(boost); - it uses the default 1.0f. if set, all fields will have this value boosted
  // store boost for use by explain and dedup
  doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));

  String inlinkCount = (inlinks == null) ? "0" : Integer.toString(inlinks.size());
  doc.add(new Field("inlinks", inlinkCount, Field.Store.YES, Field.Index.NO));

  // Read the outlinks once instead of once for the null test and again for the length.
  Outlink[] outlinks = parseData.getOutlinks();
  String outlinkCount = (outlinks == null) ? "0" : Integer.toString(outlinks.length);
  doc.add(new Field("outlinks", outlinkCount, Field.Store.YES, Field.Index.NO));

  String pagerankValue = (pagerank == null) ? "0" : Float.toString(pagerank);
  doc.add(new Field("pagerank", pagerankValue, Field.Store.YES, Field.Index.NO));

  output.collect(key, new ObjectWritable(doc));
}