* @param origPageDB the input pagedb.
* @param destPageDB the output pagedb that will hold the trimmed result.
* @param progress the crawler progress handler used to report trimming status; note it is dereferenced when building the PageFilter, so a null value is not safe here despite the null checks elsewhere.
*/
public void trimPageDB (PageDB origPageDB, PageDB destPageDB, CrawlerProgress progress) throws IOException, MalformedURLException {
Page bestPage;
bestPage = new Page("",0);
bestPage.setDistance(Integer.MAX_VALUE);
bestPage.setRetries(0);
bestPage.setLastAttempt(0);
bestPage.setLastSuccess(0);
bestPage.setLastChange(0);
bestPage.setPriority(-Float.MAX_VALUE);
bestPage.setEmitted(false);
bestPage.setSignature(new TextSignature(""));
bestPage.setAntiScore(0);
boolean hasAntiScore = false;
boolean unfetched = false;
int inlinks = 0;
logger.debug("Trimming the pagedb");
origPageDB.open(PageDB.READ);
destPageDB.open(PageDB.WRITE + PageDB.UNSORTED);
destPageDB.setSameCycleAs(origPageDB);
long dbSize = origPageDB.getSize();
long dbFetchedSize = origPageDB.getFetchedSize();
if (null != progress) {
progress.startTrim(dbSize);
origPageDB.setProgressHandler(progress);
}
int counter = 0;
PageRank pageRank = new PageRank(dbSize);
PageRank badRank = new PageRank(dbSize);
PageFilter pageFilter = new PageFilter(maxDistance, maxRetries, dbFetchedSize, discoveryFrontSize, progress.discovered());
// This code produces one page for each block of same-url pages.
// The produced page has the best properties of the block,
// Unfetched pages in the block contribute to the pagerank of the
// resulting page. If there are no unfetched pages in the block,
// the fetched page is simply copied.
for (Page page : origPageDB) {
if (!Crawler.running()) break;
if (null != progress) {
counter++;
if (counter >= 1000) {
progress.addTrimmed(counter);
progress.report();
counter = 0;
}
}
// get data for this page
String url = page.getUrl().toLowerCase();
logger.debug(" reading page " + url);
long lastAttempt = page.getLastAttempt();
long lastSuccess = page.getLastSuccess();
long lastChange = page.getLastChange();
if (url.equals(bestPage.getUrl().toLowerCase())) { // both pages have the same url
logger.debug(" same page, will keep reading.");
if (page.getScore() < 0) {
// this is not a real link but a back-propagation vector for the anti-score
hasAntiScore = true;
badRank.addContribution(page.getAntiScore());
} else {
// add the anchor to the list of anchors of this block
bestPage.addAnchors(page.getAnchors());
// add the urls of the pages linking to this block
bestPage.addParents(page.getParents());
// if this page has not been fetched, mark the block as unfetched,
// add its score to the block score and count it as an incomming link
if (lastSuccess == 0L) {
unfetched = true;
pageRank.addContribution(page.getScore());
inlinks++;
}
// keep the shortest distance
int distance = page.getDistance();
if (distance < bestPage.getDistance()) {
bestPage.setDistance(distance);
}
// keep the latest fetch
if (lastAttempt > bestPage.getLastAttempt()) {
bestPage.setLastAttempt(lastAttempt);
}
// keep the latest success
if (lastSuccess > bestPage.getLastSuccess()) {
bestPage.setLastSuccess(lastSuccess);
}
// keep the latest change
if (lastChange > bestPage.getLastChange()) {
bestPage.setLastChange(lastChange);
}
// keep the least retries
int retries = page.getRetries();
if (lastSuccess < lastAttempt || lastSuccess == 0) {
// if this page has not been successfuly fetched keep the most retries
// (one will be for the actual attempt, the rest will be unattempted links)
if (retries > bestPage.getRetries()) {
bestPage.setRetries(retries);
}
}
// keep the old priority, antiscore, hash and emitted
if (lastSuccess > 0) {
bestPage.setSignature(page.getSignature());
bestPage.setEmitted(page.isEmitted());
bestPage.setPriority(page.getPriority());
bestPage.setAntiScore(page.getAntiScore());
}
}
} else { // The page is not a duplicate
if (bestPage.getUrl().length() > 0) {
// if this is not the first page, write the best of the last block of similar pages
logger.debug(" new page, will write previous one: " + bestPage.getUrl());
bestPage.setNumInlinks(inlinks);
if (unfetched) {
bestPage.setScore(pageRank.getPageScore());
}
if (hasAntiScore) {
// new antiscore is the average between the original value and the children contributions
float antiScore = (badRank.getPageScore() + bestPage.getAntiScore()) / 2f;
bestPage.setAntiScore(antiScore);
}
if (pageFilter.shouldWrite (destPageDB, bestPage)) {
updatePriority(bestPage);
destPageDB.addPage(bestPage);
}
}
// this page starts a new block of similar pages, record its properties
bestPage.setUrl(page.getUrl());
bestPage.setDistance(page.getDistance());
bestPage.setLastAttempt(page.getLastAttempt());
bestPage.setLastSuccess(page.getLastSuccess());
bestPage.setLastChange(page.getLastChange());
bestPage.setRetries(page.getRetries());
bestPage.setAnchors(page.getAnchors());
bestPage.setParents(page.getParents());
bestPage.setScore(page.getScore());
bestPage.setAntiScore(page.getAntiScore());
bestPage.setPriority(page.getPriority());
bestPage.setSignature(page.getSignature());
bestPage.setEmitted(page.isEmitted());
unfetched = (page.getLastSuccess() == 0L);
hasAntiScore = (page.getScore() < 0);
inlinks = 0;
pageRank.reset();
badRank.reset();
if (hasAntiScore) {
badRank.addContribution(page.getAntiScore());
} else if (unfetched) {
pageRank.addContribution(page.getScore());
inlinks++;
}
}
}
if (null != progress) {
progress.addTrimmed(counter);
progress.report();
}
if (bestPage.getUrl().length() > 0) {
// if the orig pagedb is not empty, write the best of the last similar block of pages
logger.debug(" pagedb is over, will write last one: " + bestPage.getUrl());
bestPage.setNumInlinks(inlinks);
if (unfetched) {
bestPage.setScore(pageRank.getPageScore());
}
if (hasAntiScore) {
// new antiscore is the average between the original value and the children contributions
float antiScore = (badRank.getPageScore() + bestPage.getAntiScore()) / 2f;
bestPage.setAntiScore(antiScore);
}
if (pageFilter.shouldWrite (destPageDB, bestPage)) {
updatePriority(bestPage);
destPageDB.addPage(bestPage);
}