try {
segdirs = new ArrayList();
// open all segments
for (int i = 0; i < allsegdirs.size(); i++) {
File dir = (File) allsegdirs.get(i);
SegmentReader sr = null;
try {
// try to autofix it if corrupted...
sr = new SegmentReader(nfs, dir, true);
} catch (Exception e) {
// this segment is hosed beyond repair, don't use it
LOG.warning("* Segment " + dir.getName() + " is corrupt beyond repair; skipping it.");
continue;
}
segdirs.add(dir);
totalRecords += sr.size;
LOG.info(" - segment " + dir.getName() + ": " + sr.size + " records.");
readers.put(dir.getName(), sr);
}
long total = totalRecords;
LOG.info("* TOTAL " + total + " input records in " + segdirs.size() + " segments.");
LOG.info("* Creating master index...");
stage = SegmentMergeStatus.STAGE_MASTERIDX;
// XXX Note that Lucene indexes don't work with NutchFileSystem for now.
// XXX For now always assume LocalFileSystem here...
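// The "master index" is a temporary Lucene index with one small document per
// input record; it carries only the metadata needed to deduplicate records
// before the surviving ones are copied into the merged output segments.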
Vector masters = new Vector();
File fsmtIndexDir = new File(output, ".fastmerge_index");
File masterDir = new File(fsmtIndexDir, "0");
if (!masterDir.mkdirs()) {
LOG.severe("Could not create a master index dir: " + masterDir);
return;
}
masters.add(masterDir);
IndexWriter iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
iw.setUseCompoundFile(false);
iw.mergeFactor = INDEX_MERGE_FACTOR;
iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
long s1 = System.currentTimeMillis();
Iterator it = readers.values().iterator();
processedRecords = 0L;
delta = System.currentTimeMillis();
while (it.hasNext()) {
SegmentReader sr = (SegmentReader) it.next();
String name = sr.segmentDir.getName();
FetcherOutput fo = new FetcherOutput();
for (long i = 0; i < sr.size; i++) {
try {
if (!sr.get(i, fo, null, null, null)) break;
Document doc = new Document();
// compute boost
float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
scorePower, boostByLinkCount, fo.getAnchors().length);
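// "sd" points back to the source record as "segmentName|recordNumber";
// only the dedup keys "uh" (URL hash) and "ch" (content hash) are indexed
// (untokenized); time, score and URL length are stored for picking winners.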
doc.add(new Field("sd", name + "|" + i, true, false, false));
doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
doc.add(new Field("score", boost + "", true, false, false));
doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
iw.addDocument(doc);
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
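// Every INDEX_SIZE documents, finish the current subindex and start a new
// one; all subindexes are merged into a single index further below.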
if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
iw.optimize();
iw.close();
LOG.info(" - creating next subindex...");
masterDir = new File(fsmtIndexDir, "" + masters.size());
if (!masterDir.mkdirs()) {
LOG.severe("Could not create a master index dir: " + masterDir);
return;
}
masters.add(masterDir);
iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
iw.setUseCompoundFile(false);
iw.mergeFactor = INDEX_MERGE_FACTOR;
iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
}
} catch (Throwable t) {
// we can assume the data is invalid from now on - break here
LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
break;
}
}
}
iw.optimize();
LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
s1 = System.currentTimeMillis();
// merge all other indexes using the latest IndexWriter (still open):
if (masters.size() > 1) {
LOG.info(" - merging subindexes...");
stage = SegmentMergeStatus.STAGE_MERGEIDX;
IndexReader[] ireaders = new IndexReader[masters.size() - 1];
for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
iw.addIndexes(ireaders);
for (int i = 0; i < masters.size() - 1; i++) {
ireaders[i].close();
FileUtil.fullyDelete((File)masters.get(i));
}
}
iw.close();
LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
LOG.info("* Removing duplicate entries...");
stage = SegmentMergeStatus.STAGE_DEDUP;
IndexReader ir = IndexReader.open(masterDir);
int i = 0;
long cnt = 0L;
processedRecords = 0L;
s1 = System.currentTimeMillis();
delta = s1;
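// Deduplication: walk every term of the "uh" and "ch" fields and, whenever
// several documents share a hash, delete all but the preferred one
// (latest fetch date for "uh", highest score then shortest URL for "ch").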
TermEnum te = ir.terms();
while(te.next()) {
Term t = te.term();
if (t == null) continue;
if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
cnt++;
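// Each record contributes one "uh" and one "ch" term, so cnt / 2
// approximates the number of records examined so far.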
processedRecords = cnt / 2;
if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
// Enumerate all docs with the same URL hash or content hash
TermDocs td = ir.termDocs(t);
if (td == null) continue;
if (t.field().equals("uh")) {
// Keep only the latest version of the document with
// the same url hash. Note: even if the content
// hash is identical, other metadata may be different, so even
// in this case it makes sense to keep the latest version.
int id = -1;
String time = null;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (time == null) {
time = doc.get("time");
id = docid;
continue;
}
String dtime = doc.get("time");
// "time" is a DateField, and can be compared lexicographically
if (dtime.compareTo(time) > 0) {
if (id != -1) {
ir.delete(id);
}
time = dtime;
id = docid;
} else {
ir.delete(docid);
}
}
}
} else if (t.field().equals("ch")) {
// Keep only the version of the document with
// the highest score, and then with the shortest url.
int id = -1;
int ul = 0;
float score = 0.0f;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (ul == 0) {
try {
ul = Integer.parseInt(doc.get("ul"));
score = Float.parseFloat(doc.get("score"));
} catch (Exception e) { /* ignore unparsable values, keep defaults */ }
id = docid;
continue;
}
int dul = 0;
float dscore = 0.0f;
try {
dul = Integer.parseInt(doc.get("ul"));
dscore = Float.parseFloat(doc.get("score"));
} catch (Exception e) { /* ignore unparsable values, keep defaults */ }
int cmp = Float.compare(dscore, score);
if (cmp == 0) {
// equal scores, select the one with shortest url
if (dul < ul) {
if (id != -1) {
ir.delete(id);
}
ul = dul;
id = docid;
} else {
ir.delete(docid);
}
} else if (cmp < 0) {
ir.delete(docid);
} else {
// higher score wins; remember its score and url length for later comparisons
if (id != -1) {
ir.delete(id);
}
ul = dul;
score = dscore;
id = docid;
}
}
}
}
}
//
// keep the IndexReader open...
//
LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
stage = SegmentMergeStatus.STAGE_WRITING;
processedRecords = 0L;
Vector outDirs = new Vector();
File outDir = new File(output, SegmentWriter.getNewSegmentName());
outDirs.add(outDir);
LOG.info("* Merging all segments into " + output.getName());
s1 = System.currentTimeMillis();
delta = s1;
nfs.mkdirs(outDir);
SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
LOG.fine(" - opening first output segment in " + outDir.getName());
FetcherOutput fo = new FetcherOutput();
Content co = new Content();
ParseText pt = new ParseText();
ParseData pd = new ParseData();
int outputCnt = 0;
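// Copy every surviving (non-deleted) document: resolve its "sd" pointer back
// to the source segment and record number, read the full record from that
// segment, and append it to the current output segment.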
for (int n = 0; n < ir.maxDoc(); n++) {
if (ir.isDeleted(n)) {
//System.out.println("-del");
continue;
}
Document doc = ir.document(n);
String segDoc = doc.get("sd");
int idx = segDoc.indexOf('|');
String segName = segDoc.substring(0, idx);
String docName = segDoc.substring(idx + 1);
SegmentReader sr = (SegmentReader) readers.get(segName);
long docid;
try {
docid = Long.parseLong(docName);
} catch (Exception e) {
continue;
}
try {
// get data from the reader
sr.get(docid, fo, co, pt, pd);
} catch (Throwable thr) {
// don't break the loop, because only one of the segments
// may be corrupted...
LOG.fine(" - corrupt record no. " + docid + " in segment " + sr.segmentDir.getName() + " - skipping.");
continue;
}
sw.append(fo, co, pt, pd);
outputCnt++;
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
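// Roll over to a new output segment every maxCount records.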
if (processedRecords % maxCount == 0) {
sw.close();
outDir = new File(output, SegmentWriter.getNewSegmentName());
LOG.fine(" - starting next output segment in " + outDir.getName());
nfs.mkdirs(outDir);
sw = new SegmentWriter(nfs, outDir, true);
outDirs.add(outDir);
}
}
LOG.info("* Merging took " + (System.currentTimeMillis() - s1) + " ms");
ir.close();
sw.close();
FileUtil.fullyDelete(fsmtIndexDir);
for (Iterator iter = readers.keySet().iterator(); iter.hasNext();) {
SegmentReader sr = (SegmentReader) readers.get(iter.next());
sr.close();
}
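// Optionally index the newly written output segments.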
if (runIndexer) {
stage = SegmentMergeStatus.STAGE_INDEXING;
totalRecords = outDirs.size();
processedRecords = 0L;