// Overwrite the key with the normalized URL
key.set(url);
if (value instanceof CrawlDatum) {
CrawlDatum datum = (CrawlDatum)value;
if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
// Tell the reducer to get rid of all instances of this key
output.collect(key, new NutchWritable(new BooleanWritable(true)));
}
}
else if (value instanceof ParseData) {
  // Parse data: emit one LinkDatum per distinct, accepted outlink, each
  // stamped with the fetch time obtained from the parse data.
  ParseData data = (ParseData) value;
  long fetchTime = getFetchTime(data);
  Outlink[] outlinkAr = data.getOutlinks();

  // Normalized target URL -> anchor text. A LinkedHashMap keeps the links in
  // the order they were first seen, so the emit order below is deterministic.
  Map<String, String> outlinkMap = new LinkedHashMap<String, String>();
  if (outlinkAr != null) {
    for (Outlink outlink : outlinkAr) {
      String toUrl = normalizeUrl(outlink.getToUrl());
      // Drop the link when normalization yielded null or the filter rejects
      // it. The null test comes first so the filter is never handed null.
      if (toUrl == null || filterUrl(toUrl) == null) {
        continue;
      }
      // Store the anchor when the URL is new, or when it is already present
      // but has no anchor yet. Map.get() returns null in both situations, so
      // one lookup covers them; an existing non-null anchor is never replaced.
      if (outlinkMap.get(toUrl) == null) {
        outlinkMap.put(toUrl, outlink.getAnchor());
      }
    }
  }

  // collect the outlinks under the fetch time
  for (Map.Entry<String, String> link : outlinkMap.entrySet()) {
    LinkDatum datum = new LinkDatum(link.getKey(), link.getValue(), fetchTime);
    output.collect(key, new NutchWritable(datum));
  }
}
else if (value instanceof LinkDatum) {
  // An already-existing link entry: re-normalize its URL and pass it through
  // only if the filter does not reject it (a null filter result means reject).
  LinkDatum existingLink = (LinkDatum) value;
  String storedUrl = existingLink.getUrl();
  String normalizedUrl = normalizeUrl(storedUrl);
  boolean accepted = filterUrl(normalizedUrl) != null;
  if (accepted) {
    // Store the normalized form back on the datum before emitting it.
    existingLink.setUrl(normalizedUrl);
    // collect existing outlinks from existing OutlinkDb
    output.collect(key, new NutchWritable(existingLink));
  }
}