* Damaged URL entries are collected in a HashSet; at the end of the function each of them
* is either repaired or removed.
*
* This method takes no parameters.
*/
public void deadlinkCleaner() {
    // Works in two passes:
    //  1. iterate over all entries and remember the hash of every entry whose loading
    //     fails with a RuntimeException;
    //  2. for each remembered hash, rewrite the stored URL to use "http://", probe it with
    //     a HEAD request, and either store the corrected URL or remove the entry.
    final Log log = new Log("URLDBCLEANUP");
    final HashSet<String> damagedURLS = new HashSet<String>();
    // number of trailing characters of the exception message that are taken as the URL hash
    final int urlHashLength = 12;
    try {
        final Iterator<URIMetadataRow> eiter = entries(true, null);
        int iteratorCount = 0;
        while (eiter.hasNext()) {
            try {
                eiter.next();
                iteratorCount++;
            } catch (final RuntimeException e) {
                final String m = e.getMessage();
                // Only messages that are long enough can carry a hash at their end. A shorter
                // (or missing) message is logged instead of being cut with substring(), which
                // would throw from inside this handler and abort the whole clean-up.
                if (m != null && m.length() >= urlHashLength) {
                    damagedURLS.add(m.substring(m.length() - urlHashLength));
                } else {
                    log.logSevere("RuntimeException:", e);
                }
            }
        }
        log.logInfo("URLs vorher: " + this.urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());

        final HTTPClient client = new HTTPClient();
        for (final String damagedHash : damagedURLS) {
            final byte[] urlHashBytes = ASCII.getBytes(damagedHash);
            // stays null in the log output if the entry could not even be read
            String oldUrlStr = null;
            try {
                // load the raw row and read the stored (broken) url string from column 1
                final Row.Entry entry = this.urlIndexFile.get(urlHashBytes, true);
                oldUrlStr = entry.getColUTF8(1).trim();
                final int pos = oldUrlStr.indexOf("://");
                // entries whose url contains no "://" are left untouched
                if (pos != -1) {
                    // try to correct the url by forcing the http scheme
                    final String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
                    final DigestURI newUrl = new DigestURI(newUrlStr);
                    if (client.HEADResponse(newUrl.toString()) != null
                            && client.getHttpResponse().getStatusLine().getStatusCode() == 200) {
                        // the corrected url answers with 200: write it back
                        entry.setCol(1, UTF8.getBytes(newUrl.toString()));
                        this.urlIndexFile.put(entry);
                        if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
                    } else {
                        // no response or a status other than 200: drop the entry
                        remove(urlHashBytes);
                        if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (client.getHttpResponse() == null ? "null" : client.getHttpResponse().getStatusLine()));
                    }
                }
            } catch (final Exception e) {
                // any failure while loading, rewriting or probing the entry leads to its removal
                remove(urlHashBytes);
                if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
            }
        }

        log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size());
    } catch (final IOException e) {
        log.logSevere("IOException", e);
    }
}