public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
  String segmentName = Generator.generateSegmentName();
  if (LOG.isInfoEnabled()) {
    LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
  }
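  // Hand the merge options to the map/reduce tasks through the job
  // configuration, under the segment.merger.* keys.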
  JobConf job = new NutchJob(getConf());
  job.setJobName("mergesegs " + out + "/" + segmentName);
  job.setBoolean("segment.merger.filter", filter);
  job.setBoolean("segment.merger.normalizer", normalize);
  job.setLong("segment.merger.slice", slice);
  job.set("segment.merger.segmentName", segmentName);
  FileSystem fs = FileSystem.get(getConf());
  // prepare the minimal common set of input dirs
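  // One flag per segment part: g=crawl_generate, f=crawl_fetch, p=crawl_parse,
  // c=content, pd=parse_data, pt=parse_text. A part can be merged only when it
  // exists in every surviving input segment, so each flag starts out true and
  // is AND-ed with a per-segment existence check below.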
  boolean g = true;
  boolean f = true;
  boolean p = true;
  boolean c = true;
  boolean pd = true;
  boolean pt = true;
  for (int i = 0; i < segs.length; i++) {
    if (!fs.exists(segs[i])) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
      }
      segs[i] = null;
      continue;
    }
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentMerger: adding " + segs[i]);
    }
    Path cDir = new Path(segs[i], Content.DIR_NAME);
    Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
    Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
    Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
    Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
    Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
    c = c && fs.exists(cDir);
    g = g && fs.exists(gDir);
    f = f && fs.exists(fDir);
    p = p && fs.exists(pDir);
    pd = pd && fs.exists(pdDir);
    pt = pt && fs.exists(ptDir);
  }
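  // Report the minimal common set that was found.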
  StringBuilder sb = new StringBuilder();
  if (c) sb.append(" " + Content.DIR_NAME);
  if (g) sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
  if (f) sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
  if (p) sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
  if (pd) sb.append(" " + ParseData.DIR_NAME);
  if (pt) sb.append(" " + ParseText.DIR_NAME);
  if (LOG.isInfoEnabled()) {
    LOG.info("SegmentMerger: using segment data from:" + sb.toString());
  }
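  // Add each surviving segment's common parts as input paths for the job.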
  for (int i = 0; i < segs.length; i++) {
    if (segs[i] == null) continue;
    if (g) {
      Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
      FileInputFormat.addInputPath(job, gDir);
    }
    if (c) {
      Path cDir = new Path(segs[i], Content.DIR_NAME);
      FileInputFormat.addInputPath(job, cDir);
    }
    if (f) {
      Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
      FileInputFormat.addInputPath(job, fDir);
    }
    if (p) {
      Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
      FileInputFormat.addInputPath(job, pDir);
    }
    if (pd) {
      Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
      FileInputFormat.addInputPath(job, pdDir);
    }
    if (pt) {
      Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
      FileInputFormat.addInputPath(job, ptDir);
    }
  }
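  // SegmentMerger serves as both mapper and reducer. Values travel wrapped in
  // MetaWrapper, which carries metadata identifying the segment part each
  // entry came from.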
  job.setInputFormat(ObjectInputFormat.class);
  job.setMapperClass(SegmentMerger.class);
  job.setReducerClass(SegmentMerger.class);
  FileOutputFormat.setOutputPath(job, out);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(MetaWrapper.class);
  job.setOutputFormat(SegmentOutputFormat.class);
  setConf(job);
  JobClient.runJob(job);
}
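
// A minimal usage sketch, not part of the original class. It assumes the
// surrounding class is org.apache.nutch.segment.SegmentMerger with a
// Configuration-taking constructor, and that a slice value of 0 leaves the
// output unsliced; the segment paths are hypothetical.
//
//   SegmentMerger merger = new SegmentMerger(NutchConfiguration.create());
//   Path out = new Path("crawl/MERGEDsegments");
//   Path[] segs = {
//       new Path("crawl/segments/20230101000000"),  // hypothetical segment dirs
//       new Path("crawl/segments/20230102000000")
//   };
//   merger.merge(out, segs, false /* filter */, false /* normalize */, 0 /* slice */);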