/**
 * Finds all URL hashes that are present in the URL metadata repository but
 * have no entry in the handle map loaded from {@code statisticFile}, and
 * dumps those hashes to {@code diffFile}.
 *
 * <p>Progress is printed to standard output: the first report appears about
 * three seconds after the scan starts, subsequent ones every ten seconds.
 *
 * <p>Both the handle map and the metadata repository are closed before the
 * dump is written, and they are also closed when the scan aborts with an
 * exception.
 *
 * @param metadataPath  directory of the URL metadata repository (opened as "text.urlmd")
 * @param statisticFile file the reference handle map is loaded from
 * @param diffFile      file the missing URL hashes are dumped to
 * @return the value returned by {@code HandleSet.dump}; reported in the log
 *         output as the number of references written
 * @throws IOException if one of the underlying stores or the dump fails with an I/O error
 * @throws RowSpaceExceededException if one of the in-memory index structures
 *         cannot allocate the space it needs
 */
public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
    System.out.println("INDEX DIFF URL-COL startup");
    final HandleSet hs;
    final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
    try {
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
        try {
            hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
            System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
            final long start = System.currentTimeMillis();
            // back-date the last report so the first progress line shows up after ~3 seconds
            long update = start - 7000;
            int count = 0;
            for (final byte[] refhash: mr) {
                if (idx.get(refhash) == -1) {
                    // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
                    hs.put(refhash);
                }
                count++;
                final long now = System.currentTimeMillis();
                if (now - update > 10000) {
                    // linear extrapolation: elapsed time per checked entry times the entries still to check;
                    // count is at least 1 here, so the division is safe
                    System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((now - start) * (mr.size() - count) / count) / 60000) + " minutes remaining");
                    update = now;
                }
            }
        } finally {
            // release the repository even if the scan was aborted by an exception
            mr.close();
        }
    } finally {
        // release the handle map even if opening the repository or the scan failed
        idx.close();
    }
    System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
    final int written = hs.dump(new File(diffFile));
    System.out.println("INDEX DIFF URL-COL finished dump, wrote " + written + " references that occur in the URL-DB, but not in the collection-dump");
    return written;
}