// Apply the redirect representative-URL ("repr URL") logic to each datum:
// for eligible datums that carry a repr URL in their metadata, re-evaluate
// that repr URL against the current url.
for (CrawlDatum datum : datums) {
// The repr URL, if present, is stored in the datum's metadata under
// Nutch.WRITABLE_REPR_URL_KEY; reprUrl is null-checked below before use.
MapWritable metadata = datum.getMetaData();
Text reprUrl = (Text)metadata.get(Nutch.WRITABLE_REPR_URL_KEY);
byte status = datum.getStatus();
// isCrawlDb: the datum reports a db status (per CrawlDatum.hasDbStatus).
boolean isCrawlDb = (CrawlDatum.hasDbStatus(datum));
// segFetched: the datum's status is exactly STATUS_FETCH_SUCCESS.
boolean segFetched = (status == CrawlDatum.STATUS_FETCH_SUCCESS);
// Only process the datum if it comes from the crawldb or is a successfully
// fetched page from the segments, and it actually has a repr URL.
if ((isCrawlDb || segFetched) && reprUrl != null) {
// src is the stored repr URL; dest is the url currently being processed
// (url is defined outside this loop).
String src = reprUrl.toString();
String dest = url;
// Both start as null so that a failed parse can be detected afterwards.
URL srcUrl = null;
URL dstUrl = null;
// Both need to be well-formed URLs. The parsed URL objects are only used
// as a validity check; the comparison below works on the strings.
try {
srcUrl = new URL(src);
// Parses url directly; this is the same value as dest.
dstUrl = new URL(url);
// If either parse throws, the corresponding variable stays null and the
// null check below skips the repr comparison (assuming the handler does
// not rethrow -- TODO confirm).
catch (MalformedURLException e) {
// If the src and repr URLs are the same after the new logic then
// remove the repr URL from the metadata as it is no longer needed.
if (srcUrl != null && dstUrl != null) {
// Re-run the repr URL selection for the (src, dest) pair.
// NOTE(review): the meaning of the third argument (true) is defined by
// URLUtil.chooseRepr -- confirm against its documentation.
String reprOut = URLUtil.chooseRepr(src, dest, true);
// The chosen repr is the destination url itself, so the stored repr URL
// entry is redundant.
if (reprOut.equals(dest)) {
// NOTE(review): reprOut equals dest at this point, so this message prints
// the same URL twice -- confirm whether src was intended for one of them.
LOG.info("Removing " + reprOut + " from " + dest);
// collect each datum