while (values.hasNext()) {
  ObjectWritable objWrite = values.next();
  Object value = objWrite.get();
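  // the map phase wraps each of its heterogeneous outputs in an
  // ObjectWritable; unwrap it and dispatch on the concrete type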
  if (value instanceof CrawlDatum) {
    CrawlDatum datum = (CrawlDatum)value;
    if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
      fetchDatums.add(datum);
    }
  }
  else if (value instanceof Node) {
    nodeDb = (Node)value;
  }
  else if (value instanceof ParseData
    && ((ParseData)value).getStatus().isSuccess()) {
    parseData = (ParseData)value;
  }
  else if (value instanceof ParseText) {
    parseText = (ParseText)value;
  }
}
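// at this point fetchDatums holds every successful fetch for this url,
// and nodeDb, parseData, and parseText hold the matching records, if any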
// if not successfully fetched and parsed then stop processing
int numDatums = fetchDatums.size();
if (numDatums == 0 || nodeDb == null || parseText == null
  || parseData == null) {
  return;
}
// get the most recent fetch time; there may be duplicates inside a
// single segment, usually due to redirects
CrawlDatum fetchDatum = null;
long mostRecent = 0L;
for (CrawlDatum cur : fetchDatums) {
  long fetchTime = cur.getFetchTime();
  if (fetchDatum == null || fetchTime > mostRecent) {
    fetchDatum = cur;
    mostRecent = fetchTime;
  }
}
// get parse metadata
Metadata metadata = parseData.getContentMeta();
Parse parse = new ParseImpl(parseText, parseData);
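// ParseImpl combines the parse text and parse data into a single Parse
// view, used below via parse.getText() and parse.getData()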
// handle redirect urls: if the fetcher followed a redirect, the
// representative url is kept in the datum metadata and is preferred
// over the original url when indexing
Text reprUrlText = (Text)fetchDatum.getMetaData().get(
  Nutch.WRITABLE_REPR_URL_KEY);
String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
String url = key.toString();
String fieldUrl = (reprUrl != null) ? reprUrl : url;
String host = URLUtil.getHost(fieldUrl);
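// note: the three trailing booleans on each FieldWritable below appear
// to be the indexed, stored, and tokenized flags, in that order,
// consistent with the per-field comments that follow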
// add segment, used to map from merged index back to segment files
FieldWritable segField = new FieldWritable(Fields.SEGMENT,
  metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
  false);
fieldsList.add(segField);
// add digest, used by dedup
FieldWritable digestField = new FieldWritable(Fields.DIGEST,
  metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
  false);
fieldsList.add(digestField);
// url is both stored and indexed, so it's both searchable and returned
fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
  true, true, true));
fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
  false, true, false));
if (reprUrl != null) {
  // if redirected, also index and store the original url
  fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
    FieldType.CONTENT, true, true, true));
}
if (host != null) {
  // add host as un-stored, indexed and tokenized
  FieldWritable hostField = new FieldWritable(Fields.HOST, host,
    FieldType.CONTENT, true, false, true);
  fieldsList.add(hostField);
  // add site as un-stored, indexed and un-tokenized
  FieldWritable siteField = new FieldWritable(Fields.SITE, host,
    FieldType.CONTENT, true, false, false);
  fieldsList.add(siteField);
}
// content is indexed, so that it's searchable, but not stored in the index
fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
  FieldType.CONTENT, true, false, true));
// title
String title = parse.getData().getTitle();
if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
  title = title.substring(0, MAX_TITLE_LENGTH);
}
// add title indexed and stored so that it can be displayed
fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
  true, true, true));
// add cached content/summary display policy, if available
String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
  fieldsList.add(new FieldWritable(Fields.CACHE, caching,
    FieldType.CONTENT, false, true, false));
}
// add timestamp when fetched, for deduplication
fieldsList.add(new FieldWritable(Fields.TSTAMP, DateTools.timeToString(
  fetchDatum.getFetchTime(), DateTools.Resolution.MILLISECOND),
  FieldType.CONTENT, false, true, false));
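// wrap everything collected for this url and emit it as the reduce output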
FieldsWritable fields = new FieldsWritable();
fields.setFieldsList(fieldsList);
output.collect(key, fields);