processedRecords = 0L;
delta = System.currentTimeMillis();
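// Pass 1: build a temporary Lucene index with one document per segment record.
// Each document stores just enough bookkeeping data (segment name and record
// number, URL hash, content hash, fetch time, score, URL length) to resolve
// duplicates later without re-reading the segment data.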
while (it.hasNext()) {
SegmentReader sr = (SegmentReader) it.next();
String name = sr.segmentDir.getName();
FetcherOutput fo = new FetcherOutput();
for (long i = 0; i < sr.size; i++) {
try {
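// read only the FetcherOutput part of the record (content, parse text and
// parse data are not needed for this pass); stop at the end of readable data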
if (!sr.get(i, fo, null, null, null)) break;
Document doc = new Document();
// compute the indexing boost from the page score, scorePower, and (optionally) the number of incoming anchors
float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
scorePower, boostByLinkCount, fo.getAnchors().length);
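// Add the bookkeeping fields; only the URL hash ("uh") and content hash ("ch")
// are indexed, the rest are stored-only and are read back in later passes.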
doc.add(new Field("sd", name + "|" + i, true, false, false));
doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
doc.add(new Field("score", boost + "", true, false, false));
doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
iw.addDocument(doc);
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
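// Roll over to a new subindex every INDEX_SIZE documents: optimize and close
// the current writer, then open a fresh one in the next numbered directory
// under fsmtIndexDir and remember it in the masters list.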
if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
iw.optimize();
iw.close();
LOG.info(" - creating next subindex...");
masterDir = new File(fsmtIndexDir, "" + masters.size());
if (!masterDir.mkdirs()) {
LOG.severe("Could not create a master index dir: " + masterDir);
return;
}
masters.add(masterDir);
iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
iw.setUseCompoundFile(false);
iw.mergeFactor = INDEX_MERGE_FACTOR;
iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
}
} catch (Throwable t) {
// we can assume the data is invalid from now on - break here
LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
break;
}
}
}
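// all segments have been indexed - optimize the last (still open) subindex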
iw.optimize();
LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
s1 = System.currentTimeMillis();
// merge all other indexes using the latest IndexWriter (still open):
if (masters.size() > 1) {
LOG.info(" - merging subindexes...");
stage = SegmentMergeStatus.STAGE_MERGEIDX;
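// Open every subindex except the last (which iw still has open), merge them
// into iw, then close the readers and delete the now-redundant directories.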
IndexReader[] ireaders = new IndexReader[masters.size() - 1];
for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
iw.addIndexes(ireaders);
for (int i = 0; i < masters.size() - 1; i++) {
ireaders[i].close();
FileUtil.fullyDelete((File)masters.get(i));
}
}
iw.close();
LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
LOG.info("* Removing duplicate entries...");
stage = SegmentMergeStatus.STAGE_DEDUP;
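// Duplicates are detected via the indexed "uh" (URL hash) and "ch" (content
// hash) terms, and removed by marking documents deleted through the
// IndexReader opened below.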
IndexReader ir = IndexReader.open(masterDir);
int i = 0;
long cnt = 0L;
processedRecords = 0L;
s1 = System.currentTimeMillis();
delta = s1;
TermEnum te = ir.terms();
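// Walk all indexed terms, skipping everything except "uh" and "ch"; each
// matching term's postings list is the set of documents sharing that hash.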
while(te.next()) {
Term t = te.term();
if (t == null) continue;
if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
cnt++;
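// two hash fields ("uh" and "ch") per record, so half the term count
// approximates the number of records covered so far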
processedRecords = cnt / 2;
if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
// Enumerate all docs with the same URL hash or content hash
TermDocs td = ir.termDocs(t);
if (td == null) continue;
if (t.field().equals("uh")) {
// Keep only the latest version of the document with
// the same url hash. Note: even if the content
// hash is identical, other metadata may be different, so even
// in this case it makes sense to keep the latest version.
int id = -1;
String time = null;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (time == null) {
time = doc.get("time");
id = docid;
continue;
}
String dtime = doc.get("time");
// "time" is a DateField, and can be compared lexicographically
if (dtime.compareTo(time) > 0) {
if (id != -1) {
ir.delete(id);
}
time = dtime;
id = docid;
} else {
ir.delete(docid);
}
}
}
} else if (t.field().equals("ch")) {
// Keep only the version of the document with the highest score;
// on equal scores, keep the one with the shortest URL.
int id = -1;
int ul = 0;
float score = 0.0f;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (ul == 0) {
try {
ul = Integer.parseInt(doc.get("ul"));
score = Float.parseFloat(doc.get("score"));
} catch (Exception e) {
// ignore malformed stored values and keep the defaults
}
id = docid;
continue;
}
int dul = 0;
float dscore = 0.0f;
try {
dul = Integer.parseInt(doc.get("ul"));
dscore = Float.parseFloat(doc.get("score"));
} catch (Exception e) {
// ignore malformed stored values and keep the defaults
}
int cmp = Float.compare(dscore, score);
if (cmp == 0) {
// equal scores, select the one with shortest url
if (dul < ul) {
if (id != -1) {
ir.delete(id);
}
ul = dul;
id = docid;
} else {
ir.delete(docid);
}
} else if (cmp < 0) {
ir.delete(docid);
} else {
// this doc has the higher score - drop the previous candidate and track its values
if (id != -1) {
ir.delete(id);
}
ul = dul;
score = dscore;
id = docid;
}
}
}
}
}
//
// keep the IndexReader open - it is still needed below to read back the
// surviving documents while writing the merged segments
//
LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
stage = SegmentMergeStatus.STAGE_WRITING;
processedRecords = 0L;
Vector outDirs = new Vector();
File outDir = new File(output, SegmentWriter.getNewSegmentName());
outDirs.add(outDir);
LOG.info("* Merging all segments into " + output.getName());
s1 = System.currentTimeMillis();
delta = s1;
nfs.mkdirs(outDir);
SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
LOG.fine(" - opening first output segment in " + outDir.getName());
FetcherOutput fo = new FetcherOutput();
Content co = new Content();
ParseText pt = new ParseText();
ParseData pd = new ParseData();
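// reusable holders for the record data that is read back from the source
// segments while writing the output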
int outputCnt = 0;
for (int n = 0; n < ir.maxDoc(); n++) {