/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import java.util.logging.Logger;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexSegment;
import org.apache.nutch.io.MD5Hash;
import org.apache.nutch.fs.*;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.segment.SegmentReader;
import org.apache.nutch.segment.SegmentWriter;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
/**
 * This class cleans up accumulated segment data and merges it into a single
 * segment (or optionally several segments), with no duplicate entries.
*
 * <p>
 * The only prerequisite for correct operation is a set of already fetched
 * segments (they don't have to contain parsed content; only the fetcher
 * output is required). This tool does not use DeleteDuplicates; instead it
 * creates its own "master" index of all pages in all segments, then walks
 * sequentially through this index and keeps only the most recent version of
 * each page for every unique URL hash or content hash.
 * </p>
 * <p>If some of the input segments are corrupted, this tool will attempt to
 * repair them, using the
 * {@link org.apache.nutch.segment.SegmentReader#fixSegment(NutchFileSystem, File, boolean, boolean, boolean, boolean)} method.</p>
 * <p>The output segment can optionally be split on the fly into several
 * segments of fixed length.</p>
 * <p>
 * The newly created segment(s) can then optionally be indexed, so that they
 * can either be merged with more new segments or used for searching as they
 * are.
 * </p>
 * <p>
 * Old segments may optionally be removed, because all needed data has already
 * been copied to the new merged segment. NOTE: this tool will also remove all
 * corrupted input segments, which are not usable anyway. However, this option
 * may be dangerous if you inadvertently included non-segment directories as
 * input...</p>
 * <p>
 * You may want to run SegmentMergeTool with all options turned on instead of
 * following the manual procedures, i.e. merge segments into the output
 * segment(s), index them, and then delete the original segment data.
 * </p>
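 * <p>
 * Example command line (a minimal sketch; the paths are hypothetical):
 * </p>
 * <pre>
 *   SegmentMergeTool -local -dir segments -o segments_merged -max 1000000 -i -ds
 * </pre>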
*
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
*/
public class SegmentMergeTool implements Runnable {
public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.tools.SegmentMergeTool");
/** Log progress update every LOG_STEP items. */
public static int LOG_STEP = 20000;
  /** Maximum number of documents in a temporary de-dup sub-index. Larger
   * indexes tend to slow down indexing; too many sub-indexes slow down the
   * subsequent index merging. It's a tradeoff value...
   */
public static int INDEX_SIZE = 250000;
public static int INDEX_MERGE_FACTOR = 30;
public static int INDEX_MIN_MERGE_DOCS = 100;
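  // Scoring settings mirrored from the indexer configuration; they feed
  // IndexSegment.calculateBoost() when the master index is built below.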
private boolean boostByLinkCount =
NutchConf.get().getBoolean("indexer.boost.by.link.count", false);
private float scorePower = NutchConf.get().getFloat("indexer.score.power", 0.5f);
private NutchFileSystem nfs = null;
private File[] segments = null;
private int stage = SegmentMergeStatus.STAGE_OPENING;
private long totalRecords = 0L;
private long processedRecords = 0L;
private long start = 0L;
private long maxCount = Long.MAX_VALUE;
private File output = null;
private List segdirs = null;
private List allsegdirs = null;
private boolean runIndexer = false;
private boolean delSegs = false;
private HashMap readers = new HashMap();
/**
* Create a SegmentMergeTool.
* @param nfs filesystem
* @param segments list of input segments
* @param output output directory, where output segments will be created
   * @param maxCount maximum number of records per output segment. If this
   * value is 0 or negative, the default value {@link Long#MAX_VALUE} is used.
   * @param runIndexer run indexer on the output segment(s)
   * @param delSegs delete input segments when finished
   * @throws Exception if the output path exists but is not a directory
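   * <p>A minimal programmatic sketch (the paths here are hypothetical):</p>
   * <pre>
   *   NutchFileSystem nfs = NutchFileSystem.parseArgs(new String[]{"-local"}, 0);
   *   File[] segs = new File("segments").listFiles(new FileFilter() {
   *     public boolean accept(File f) { return f.isDirectory(); }
   *   });
   *   // merge, index the result (runIndexer=true), keep the inputs (delSegs=false)
   *   SegmentMergeTool tool =
   *     new SegmentMergeTool(nfs, segs, new File("segments_merged"), 0, true, false);
   *   tool.run();
   * </pre>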
*/
public SegmentMergeTool(NutchFileSystem nfs, File[] segments, File output, long maxCount, boolean runIndexer, boolean delSegs) throws Exception {
this.nfs = nfs;
this.segments = segments;
this.runIndexer = runIndexer;
this.delSegs = delSegs;
if (maxCount > 0) this.maxCount = maxCount;
allsegdirs = Arrays.asList(segments);
this.output = output;
if (nfs.exists(output)) {
if (!nfs.isDirectory(output))
throw new Exception("Output is not a directory: " + output);
} else nfs.mkdirs(output);
}
public static class SegmentMergeStatus {
public static final int STAGE_OPENING = 0;
public static final int STAGE_MASTERIDX = 1;
public static final int STAGE_MERGEIDX = 2;
public static final int STAGE_DEDUP = 3;
public static final int STAGE_WRITING = 4;
public static final int STAGE_INDEXING = 5;
public static final int STAGE_DELETING = 6;
public static final String[] stages = {
"opening input segments",
"creating master index",
"merging sub-indexes",
"deduplicating",
"writing output segment(s)",
"indexing output segment(s)",
"deleting input segments"
};
public int stage;
public File[] inputSegments;
public long startTime, curTime;
public long totalRecords;
public long processedRecords;
    public SegmentMergeStatus() {}
public SegmentMergeStatus(int stage, File[] inputSegments, long startTime,
long totalRecords, long processedRecords) {
this.stage = stage;
this.inputSegments = inputSegments;
this.startTime = startTime;
this.curTime = System.currentTimeMillis();
this.totalRecords = totalRecords;
this.processedRecords = processedRecords;
}
}
public SegmentMergeStatus getStatus() {
SegmentMergeStatus status = new SegmentMergeStatus(stage, segments, start,
totalRecords, processedRecords);
return status;
}
/** Run the tool, periodically reporting progress. */
public void run() {
start = System.currentTimeMillis();
stage = SegmentMergeStatus.STAGE_OPENING;
long delta;
LOG.info("* Opening " + allsegdirs.size() + " segments:");
try {
segdirs = new ArrayList();
// open all segments
for (int i = 0; i < allsegdirs.size(); i++) {
File dir = (File) allsegdirs.get(i);
SegmentReader sr = null;
try {
// try to autofix it if corrupted...
sr = new SegmentReader(nfs, dir, true);
} catch (Exception e) {
// this segment is hosed beyond repair, don't use it
LOG.warning("* Segment " + dir.getName() + " is corrupt beyond repair; skipping it.");
continue;
}
segdirs.add(dir);
totalRecords += sr.size;
LOG.info(" - segment " + dir.getName() + ": " + sr.size + " records.");
readers.put(dir.getName(), sr);
}
long total = totalRecords;
LOG.info("* TOTAL " + total + " input records in " + segdirs.size() + " segments.");
LOG.info("* Creating master index...");
stage = SegmentMergeStatus.STAGE_MASTERIDX;
// XXX Note that Lucene indexes don't work with NutchFileSystem for now.
// XXX For now always assume LocalFileSystem here...
Vector masters = new Vector();
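      // Temporary de-dup sub-indexes live under <output>/.fastmerge_index;
      // the whole directory is removed once the merged segment is written.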
File fsmtIndexDir = new File(output, ".fastmerge_index");
File masterDir = new File(fsmtIndexDir, "0");
if (!masterDir.mkdirs()) {
LOG.severe("Could not create a master index dir: " + masterDir);
return;
}
masters.add(masterDir);
IndexWriter iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
iw.setUseCompoundFile(false);
iw.mergeFactor = INDEX_MERGE_FACTOR;
iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
long s1 = System.currentTimeMillis();
Iterator it = readers.values().iterator();
processedRecords = 0L;
delta = System.currentTimeMillis();
while (it.hasNext()) {
SegmentReader sr = (SegmentReader) it.next();
String name = sr.segmentDir.getName();
FetcherOutput fo = new FetcherOutput();
for (long i = 0; i < sr.size; i++) {
try {
if (!sr.get(i, fo, null, null, null)) break;
Document doc = new Document();
// compute boost
float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
scorePower, boostByLinkCount, fo.getAnchors().length);
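          // Master index schema: "sd" = segment name + "|" + record number,
          // "uh" = MD5 of the URL, "ch" = MD5 of the content, "time" = fetch
          // date, "score" = boost, "ul" = URL length (a de-dup tie-breaker).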
doc.add(new Field("sd", name + "|" + i, true, false, false));
doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
doc.add(new Field("score", boost + "", true, false, false));
doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
iw.addDocument(doc);
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
iw.optimize();
iw.close();
LOG.info(" - creating next subindex...");
masterDir = new File(fsmtIndexDir, "" + masters.size());
if (!masterDir.mkdirs()) {
LOG.severe("Could not create a master index dir: " + masterDir);
return;
}
masters.add(masterDir);
iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
iw.setUseCompoundFile(false);
iw.mergeFactor = INDEX_MERGE_FACTOR;
iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
}
          } catch (Throwable t) {
            // we can assume the rest of the data is invalid - break here;
            // exactly i records were successfully indexed so far
            LOG.info(" - segment " + name + " truncated to " + i + " records");
            break;
}
}
}
iw.optimize();
LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
s1 = System.currentTimeMillis();
// merge all other indexes using the latest IndexWriter (still open):
if (masters.size() > 1) {
LOG.info(" - merging subindexes...");
stage = SegmentMergeStatus.STAGE_MERGEIDX;
IndexReader[] ireaders = new IndexReader[masters.size() - 1];
for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
iw.addIndexes(ireaders);
for (int i = 0; i < masters.size() - 1; i++) {
ireaders[i].close();
FileUtil.fullyDelete((File)masters.get(i));
}
}
iw.close();
LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
LOG.info("* Removing duplicate entries...");
stage = SegmentMergeStatus.STAGE_DEDUP;
IndexReader ir = IndexReader.open(masterDir);
long cnt = 0L;
processedRecords = 0L;
s1 = System.currentTimeMillis();
delta = s1;
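      // Walk all terms in the master index; only the URL-hash ("uh") and
      // content-hash ("ch") terms drive de-duplication.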
TermEnum te = ir.terms();
while(te.next()) {
Term t = te.term();
if (t == null) continue;
if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
cnt++;
processedRecords = cnt / 2;
if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
// Enumerate all docs with the same URL hash or content hash
TermDocs td = ir.termDocs(t);
if (td == null) continue;
if (t.field().equals("uh")) {
// Keep only the latest version of the document with
// the same url hash. Note: even if the content
// hash is identical, other metadata may be different, so even
// in this case it makes sense to keep the latest version.
int id = -1;
String time = null;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (time == null) {
time = doc.get("time");
id = docid;
continue;
}
String dtime = doc.get("time");
// "time" is a DateField, and can be compared lexicographically
if (dtime.compareTo(time) > 0) {
if (id != -1) {
ir.delete(id);
}
time = dtime;
id = docid;
} else {
ir.delete(docid);
}
}
}
} else if (t.field().equals("ch")) {
// Keep only the version of the document with
// the highest score, and then with the shortest url.
int id = -1;
int ul = 0;
float score = 0.0f;
Document doc = null;
while (td.next()) {
int docid = td.doc();
if (!ir.isDeleted(docid)) {
doc = ir.document(docid);
if (ul == 0) {
try {
ul = Integer.parseInt(doc.get("ul"));
score = Float.parseFloat(doc.get("score"));
                } catch (Exception e) {
                  // ignore unparsable values and keep the defaults
                }
id = docid;
continue;
}
int dul = 0;
float dscore = 0.0f;
try {
dul = Integer.parseInt(doc.get("ul"));
dscore = Float.parseFloat(doc.get("score"));
                } catch (Exception e) {
                  // ignore unparsable values and keep the defaults
                }
int cmp = Float.compare(dscore, score);
if (cmp == 0) {
// equal scores, select the one with shortest url
if (dul < ul) {
if (id != -1) {
ir.delete(id);
}
ul = dul;
id = docid;
} else {
ir.delete(docid);
}
              } else if (cmp < 0) {
                ir.delete(docid);
              } else {
                // dscore > score: keep this higher-scored document and update
                // the running best score for subsequent comparisons
                if (id != -1) {
                  ir.delete(id);
                }
                ul = dul;
                score = dscore;
                id = docid;
              }
}
}
}
}
//
// keep the IndexReader open...
//
LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
stage = SegmentMergeStatus.STAGE_WRITING;
processedRecords = 0L;
Vector outDirs = new Vector();
File outDir = new File(output, SegmentWriter.getNewSegmentName());
outDirs.add(outDir);
LOG.info("* Merging all segments into " + output.getName());
s1 = System.currentTimeMillis();
delta = s1;
nfs.mkdirs(outDir);
SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
LOG.fine(" - opening first output segment in " + outDir.getName());
FetcherOutput fo = new FetcherOutput();
Content co = new Content();
ParseText pt = new ParseText();
ParseData pd = new ParseData();
int outputCnt = 0;
for (int n = 0; n < ir.maxDoc(); n++) {
if (ir.isDeleted(n)) {
continue;
}
Document doc = ir.document(n);
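        // Decode the "sd" pointer ("segmentName|recordNumber") to locate the
        // original record in its source segment.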
String segDoc = doc.get("sd");
int idx = segDoc.indexOf('|');
String segName = segDoc.substring(0, idx);
String docName = segDoc.substring(idx + 1);
SegmentReader sr = (SegmentReader) readers.get(segName);
long docid;
try {
docid = Long.parseLong(docName);
} catch (Exception e) {
continue;
}
try {
// get data from the reader
sr.get(docid, fo, co, pt, pd);
} catch (Throwable thr) {
// don't break the loop, because only one of the segments
// may be corrupted...
LOG.fine(" - corrupt record no. " + docid + " in segment " + sr.segmentDir.getName() + " - skipping.");
continue;
}
sw.append(fo, co, pt, pd);
outputCnt++;
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
LOG.info(" Processed " + processedRecords + " records (" +
(float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
delta = System.currentTimeMillis();
}
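        // Roll over to a new output segment once maxCount records are written.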
if (processedRecords % maxCount == 0) {
sw.close();
outDir = new File(output, SegmentWriter.getNewSegmentName());
LOG.fine(" - starting next output segment in " + outDir.getName());
nfs.mkdirs(outDir);
sw = new SegmentWriter(nfs, outDir, true);
outDirs.add(outDir);
}
}
LOG.info("* Merging took " + (System.currentTimeMillis() - s1) + " ms");
ir.close();
sw.close();
FileUtil.fullyDelete(fsmtIndexDir);
      for (Iterator iter = readers.values().iterator(); iter.hasNext();) {
        SegmentReader sr = (SegmentReader) iter.next();
        sr.close();
      }
if (runIndexer) {
stage = SegmentMergeStatus.STAGE_INDEXING;
totalRecords = outDirs.size();
processedRecords = 0L;
LOG.info("* Creating new segment index(es)...");
File workingDir = new File(output, "indexsegment-workingdir");
for (int k = 0; k < outDirs.size(); k++) {
processedRecords++;
if (workingDir.exists()) {
FileUtil.fullyDelete(workingDir);
}
IndexSegment indexer = new IndexSegment(nfs, Integer.MAX_VALUE,
(File)outDirs.get(k), workingDir);
indexer.indexPages();
FileUtil.fullyDelete(workingDir);
}
}
if (delSegs) {
        // This also deletes all corrupt segments, which are
        // unusable anyway
stage = SegmentMergeStatus.STAGE_DELETING;
totalRecords = allsegdirs.size();
processedRecords = 0L;
LOG.info("* Deleting old segments...");
for (int k = 0; k < allsegdirs.size(); k++) {
processedRecords++;
FileUtil.fullyDelete((File) allsegdirs.get(k));
}
}
delta = System.currentTimeMillis() - start;
      // use float division so sub-second runs don't divide by zero
      float eps = (float) total / ((float) delta / 1000.0f);
LOG.info("Finished SegmentMergeTool: INPUT: " + total + " -> OUTPUT: " + outputCnt + " entries in "
+ ((float) delta / 1000f) + " s (" + eps + " entries/sec).");
} catch (Exception e) {
e.printStackTrace();
LOG.severe(e.getMessage());
}
}
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Too few arguments.\n");
usage();
System.exit(-1);
}
NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
boolean runIndexer = false;
boolean delSegs = false;
long maxCount = Long.MAX_VALUE;
String segDir = null;
File output = null;
Vector dirs = new Vector();
for (int i = 0; i < args.length; i++) {
if (args[i] == null) continue;
if (args[i].equals("-o")) {
if (args.length > i + 1) {
output = new File(args[++i]);
continue;
} else {
LOG.severe("Required value of '-o' argument missing.\n");
usage();
return;
}
} else if (args[i].equals("-i")) {
runIndexer = true;
} else if (args[i].equals("-cm")) {
LOG.warning("'-cm' option obsolete - ignored.");
} else if (args[i].equals("-max")) {
String cnt = args[++i];
try {
maxCount = Long.parseLong(cnt);
} catch (Exception e) {
LOG.warning("Invalid count '" + cnt + "', setting to Long.MAX_VALUE.");
}
} else if (args[i].equals("-ds")) {
delSegs = true;
} else if (args[i].equals("-dir")) {
segDir = args[++i];
} else dirs.add(new File(args[i]));
}
if (segDir != null) {
File sDir = new File(segDir);
if (!sDir.exists() || !sDir.isDirectory()) {
LOG.warning("Invalid path: " + sDir);
} else {
File[] files = sDir.listFiles(new FileFilter() {
public boolean accept(File f) {
return f.isDirectory();
}
});
if (files != null && files.length > 0) {
for (int i = 0; i < files.length; i++) dirs.add(files[i]);
}
}
}
if (dirs.size() == 0) {
LOG.severe("No input segments.");
return;
}
if (output == null) output = ((File)dirs.get(0)).getParentFile();
SegmentMergeTool st = new SegmentMergeTool(nfs, (File[])dirs.toArray(new File[0]),
output, maxCount, runIndexer, delSegs);
st.run();
}
private static void usage() {
System.err.println("SegmentMergeTool (-local | -nfs ...) (-dir <input_segments_dir> | seg1 seg2 ...) [-o <output_segments_dir>] [-max count] [-i] [-ds]");
System.err.println("\t-dir <input_segments_dir>\tpath to directory containing input segments");
System.err.println("\tseg1 seg2 seg3\t\tindividual paths to input segments");
System.err.println("\t-o <output_segment_dir>\t(optional) path to directory which will\n\t\t\t\tcontain output segment(s).\n\t\t\tNOTE: If not present, the original segments path will be used.");
System.err.println("\t-max count\t(optional) output multiple segments, each with maximum 'count' entries");
System.err.println("\t-i\t\t(optional) index the output segment when finished merging.");
System.err.println("\t-ds\t\t(optional) delete the original input segments when finished.");
System.err.println();
}
}