OutputCollector<Text, FieldsWritable> output, Reporter reporter)
throws IOException {
Node nodeDb = null;
List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
ParseData parseData = null;
ParseText parseText = null;
List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();
// assign values, url must be successfully fetched and parsed
while (values.hasNext()) {
ObjectWritable objWrite = values.next();
Object value = objWrite.get();
if (value instanceof CrawlDatum) {
CrawlDatum datum = (CrawlDatum)value;
if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
fetchDatums.add(datum);
}
}
else if (value instanceof Node) {
nodeDb = (Node)value;
}
else if (value instanceof ParseData
&& ((ParseData)value).getStatus().isSuccess()) {
parseData = (ParseData)value;
}
else if (value instanceof ParseText) {
parseText = (ParseText)value;
}
}
// if not successfully fetched and parsed then stop processing
int numDatums = fetchDatums.size();
if (numDatums == 0 || nodeDb == null || parseText == null
|| parseData == null) {
return;
}
// pick the datum with the most recent fetch time; there can be duplicates
// inside of a single segment, usually due to redirects
CrawlDatum fetchDatum = null;
long mostRecent = 0L;
for (CrawlDatum cur : fetchDatums) {
long fetchTime = cur.getFetchTime();
if (fetchDatum == null || fetchTime > mostRecent) {
fetchDatum = cur;
mostRecent = fetchTime;
}
}
// get parse metadata
Metadata metadata = parseData.getContentMeta();
Parse parse = new ParseImpl(parseText, parseData);
// handle redirect urls
Text reprUrlText = (Text)fetchDatum.getMetaData().get(
Nutch.WRITABLE_REPR_URL_KEY);