/**
 * Builds a minimized copy of the URL metadata database that contains only those
 * URLs which are still referenced by the word index.
 * <p>
 * The word index below {@code DATA/INDEX/<networkName>/TEXT} is scanned term by term;
 * every referenced URL hash that exists in the current URL database and is not yet
 * present in the minimized database is copied to
 * {@code DATA/INDEX2/<networkName>/TEXT}. Progress is logged every 500 URLs and
 * every 500 words. All failures are logged; nothing is thrown to the caller.
 * <p>
 * Run with {@code java -classpath classes yacy -minimizeUrlDB}.
 *
 * @param dataHome    root directory containing the {@code DATA} folder
 * @param appHome     application home, passed on to the logging configuration
 * @param networkName name of the network sub-directory used for the URL databases
 *                    and for the word index
 */
public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) {
    try {
        Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));
    } catch (final Exception e) {
        // best effort: keep going with whatever logging is active, but do not hide the failure
        System.err.println("could not configure logging: " + e.getMessage());
    }
    final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
    final File indexRoot2 = new File(dataHome, "DATA/INDEX2");
    final Log log = new Log("URL-CLEANUP");

    // declared outside the inner try so that the finally block can release whatever was opened
    MetadataRepository currentUrlDB = null;
    MetadataRepository minimizedUrlDB = null;
    Segment wordIndex = null;
    try {
        log.logInfo("STARTING URL CLEANUP");
        try {
            // db containing all currently loaded urls
            currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false);
            // db used to hold all needed urls
            minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);

            // keep this as long: narrowing to int wraps around once more than 2 GiB are
            // available and would then abort with a bogus out-of-memory error
            final long cacheMem = MemoryControl.maxMemory - MemoryControl.total();
            if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");

            // the word index must come from the same network as the URL databases
            wordIndex = new Segment(
                    log,
                    new File(new File(indexPrimaryRoot, networkName), "TEXT"),
                    10000,
                    (long) Integer.MAX_VALUE, false, false);
            final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().references("AAAAAAAAAAAA".getBytes(), false, false);

            long urlCounter = 0, wordCounter = 0;
            long wordChunkStart = System.currentTimeMillis();
            String wordChunkStartHash = "AAAAAAAAAAAA";
            while (indexContainerIterator.hasNext()) {
                try {
                    wordCounter++;
                    final ReferenceContainer<WordReference> wordIdxContainer = indexContainerIterator.next();

                    // copy every URL referenced by this term that is not yet in the minimized db
                    final Iterator<WordReference> wordIdxEntries = wordIdxContainer.entries();
                    while (wordIdxEntries.hasNext()) {
                        final Reference iEntry = wordIdxEntries.next();
                        final byte[] urlHash = iEntry.urlhash();
                        if (!currentUrlDB.exists(urlHash) || minimizedUrlDB.exists(urlHash)) continue;
                        try {
                            final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
                            // NOTE(review): presumably load() can still return null for an unreadable
                            // entry; skipping it keeps the remaining references of this term alive
                            if (urlEntry == null) continue;
                            urlCounter++;
                            minimizedUrlDB.store(urlEntry);
                            if (urlCounter % 500 == 0) {
                                log.logInfo(urlCounter + " URLs found so far.");
                            }
                        } catch (final IOException e) {
                            // a single unreadable/unwritable entry must not stop the scan, but report it
                            log.logSevere("cannot copy URL " + ASCII.String(urlHash) + ": " + e.getMessage(), e);
                        }
                    }

                    if (wordCounter % 500 == 0) {
                        final String wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash());
                        final long wordChunkEnd = System.currentTimeMillis();
                        // clamp to 1 ms: a chunk finishing within the clock resolution would
                        // otherwise cause a division by zero
                        final long duration = Math.max(1L, wordChunkEnd - wordChunkStart);
                        log.logInfo(wordCounter + " words scanned " +
                                "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
                                "Duration: " + 500 * 1000 / duration + " words/s" +
                                " | Free memory: " + MemoryControl.free() +
                                " | Total memory: " + MemoryControl.total());
                        wordChunkStart = wordChunkEnd;
                        wordChunkStartHash = wordChunkEndHash;
                    }
                } catch (final Exception e) {
                    // a broken term container is logged and skipped; the scan continues
                    log.logSevere("Exception", e);
                }
            }
            log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
            log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");
        } finally {
            // release everything that was opened, also on failure; the nesting makes sure
            // that a failing close() does not prevent the remaining resources from closing
            try {
                if (currentUrlDB != null) currentUrlDB.close();
            } finally {
                try {
                    if (minimizedUrlDB != null) minimizedUrlDB.close();
                } finally {
                    if (wordIndex != null) wordIndex.close();
                }
            }
        }

        // TODO: rename the minimized UrlDB to the name of the previous UrlDB
        log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
        log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
        log.logInfo("TERMINATED URL CLEANUP");
    } catch (final Exception e) {
        log.logSevere("Exception: " + e.getMessage(), e);
    } catch (final Error e) {
        // command-line entry point: report errors (e.g. the memory check above) instead of dying silently
        log.logSevere("Error: " + e.getMessage(), e);
    }
}