// We will end up with 1..n entries of (C)rawlDbDatum, (S)tatusDatum, (A)nalyzedDatum, (L)inkDatum
// [C | S | A | L] [C | S | A | L] [C | S | A | L] [C | S | A | L]
//
// State accumulated while walking the entries. For each datum type only the
// most recently seen instance is kept; link scores are summed.
CrawlDbDatum crawlDbDatum = null;
StatusDatum statusDatum = null;
AnalyzedDatum analyzedDatum = null;
UrlStatus status = null;
float pageScore = 0;
float linkScore = 0;
// Overwritten by every datum found, so it ends up holding the URL of the last one seen.
String url = null;
while (iter.hasNext()) {
TupleEntry entry = iter.next();
// A datum type is treated as present in this entry when its URL field is non-null.
boolean isCrawlDatum = entry.getString(CRAWLDBDATUM_URL_FIELD) != null;
boolean isStatus = entry.getString(STATUSDATUM_URL_FIELD) != null;
boolean isAnalyzed = entry.getString(ANALYZEDDATUM_URL_FIELD) != null;
// The checks below are independent (not else-if): one entry may carry several datum types.
if (isCrawlDatum) {
Tuple crawlDbTuple = TupleEntry.select(CrawlDbDatum.FIELDS, entry);
crawlDbDatum = new CrawlDbDatum(crawlDbTuple);
url = crawlDbDatum.getUrl();
}
if (isStatus) {
// Unlike the other datum types, StatusDatum is built from the whole entry
// rather than from a field-selected tuple.
statusDatum = new StatusDatum(entry);
url = statusDatum.getUrl();
}
if (isAnalyzed) {
Tuple analyzedTuple = TupleEntry.select(AnalyzedDatum.FIELDS, entry);
analyzedDatum = new AnalyzedDatum(analyzedTuple);
url = analyzedDatum.getUrl();
}
// We could have either a status + link tuple entry or just a link tuple entry.
if (entry.getString(new Fields(LinkDatum.URL_FN)) != null) {
LinkDatum linkDatum = new LinkDatum(TupleEntry.select(LinkDatum.FIELDS, entry));
// Provisional value: the resolution step after the loop may overwrite it.
pageScore = linkDatum.getPageScore();
// Add up the link scores
linkScore += linkDatum.getLinkScore();
url = linkDatum.getUrl();
}
}
// Resolve the final status, scores and last-fetched time. A CrawlDbDatum takes
// precedence over a StatusDatum; analyzedDatum is only consulted in the
// StatusDatum branch.
long lastFetched = 0;
if (crawlDbDatum != null) {
status = crawlDbDatum.getLastStatus();
pageScore = crawlDbDatum.getPageScore();
// Added on top of the link scores already summed in the loop.
linkScore += crawlDbDatum.getLinksScore();
lastFetched = crawlDbDatum.getLastFetched();
} else if (statusDatum != null) {
status = statusDatum.getStatus();
if (status != UrlStatus.FETCHED ) {
pageScore = 0; // if we didn't fetch the page, then we can't have a page score
// NOTE(review): the (Float) result is unboxed into a float, so this throws a
// NullPointerException if getPayloadValue returns null - confirm the payload
// always carries LINKS_SCORE_FN for non-fetched status datums.
linkScore += (Float)statusDatum.getPayloadValue(CustomFields.LINKS_SCORE_FN);
} else {
// Fetched: prefer the analyzed page score. If no AnalyzedDatum was seen,
// pageScore keeps whatever the loop left (0, or the last LinkDatum's page score).
if (analyzedDatum != null) {
pageScore = analyzedDatum.getPageScore();
}
}
lastFetched = statusDatum.getStatusTime();
} else {
// Neither a CrawlDbDatum nor a StatusDatum was seen: mark the URL as unfetched.
// lastFetched stays 0 and the scores keep their values from the loop.
status = UrlStatus.UNFETCHED;
}
if (url == null) {