.getFetchSchedule(new JobConf(conf));
for (int i = 0; i < fetchDbStatusPairs.length; i++) {
byte fromDbStatus = fetchDbStatusPairs[i][1];
for (int j = 0; j < fetchDbStatusPairs.length; j++) {
byte fetchStatus = fetchDbStatusPairs[j][0];
CrawlDatum fromDb = null;
if (fromDbStatus == -1) {
// nothing yet in CrawlDb
// CrawlDatum added by FreeGenerator or via outlink
} else {
fromDb = new CrawlDatum();
fromDb.setStatus(fromDbStatus);
// initialize fetchInterval:
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
}
// expected db status
byte toDbStatus = fetchDbStatusPairs[j][1];
if (fetchStatus == -1) {
if (fromDbStatus == -1) {
// nothing fetched yet: new document detected via outlink
toDbStatus = STATUS_DB_UNFETCHED;
} else {
// nothing fetched but new inlinks detected: status is unchanged
toDbStatus = fromDbStatus;
}
} else if (fetchStatus == STATUS_FETCH_RETRY) {
// a simple test of fetch_retry (without retries)
if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
toDbStatus = STATUS_DB_UNFETCHED;
} else {
toDbStatus = STATUS_DB_GONE;
}
}
String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>"
: getStatusName(fromDbStatus));
String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>" : CrawlDatum
.getStatusName(fetchStatus));
LOG.info(fromDbStatusName + " + " + fetchStatusName + " => "
+ getStatusName(toDbStatus));
List<CrawlDatum> values = new ArrayList<CrawlDatum>();
for (int l = 0; l <= 2; l++) { // number of additional in-links
CrawlDatum fetch = null;
if (fetchStatus == -1) {
// nothing fetched, need at least one in-link
if (l == 0) continue;
} else {
fetch = new CrawlDatum();
if (fromDb != null) {
fetch.set(fromDb);
} else {
// not yet in CrawlDb: added by FreeGenerator
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
}
fetch.setStatus(fetchStatus);
fetch.setFetchTime(System.currentTimeMillis());
}
if (fromDb != null)
values.add(fromDb);
if (fetch != null)
values.add(fetch);