// Per-key selection state. For every kind of record handled below there is a
// pair: the value currently selected and the segment name it was tagged with.
// lastG / lastF: CrawlDatum whose segment part is GENERATE_DIR_NAME / FETCH_DIR_NAME.
// lastSig: CrawlDatum from PARSE_DIR_NAME whose status is STATUS_SIGNATURE.
CrawlDatum lastG = null;
CrawlDatum lastF = null;
CrawlDatum lastSig = null;
// lastC / lastPD / lastPT: selected Content, ParseData and ParseText values.
Content lastC = null;
ParseData lastPD = null;
ParseText lastPT = null;
// Segment names matching the selected values above; these are what the
// String.compareTo() checks compare against when a competing value arrives.
String lastGname = null;
String lastFname = null;
String lastSigname = null;
String lastCname = null;
String lastPDname = null;
String lastPTname = null;
// Segment name -> list of the remaining (non-signature) CrawlDatum values from
// PARSE_DIR_NAME. A TreeMap keeps the segment names in sorted order.
TreeMap linked = new TreeMap();
// Walk every wrapped value for this key and, for each kind of record, keep the
// one whose segment name compares greatest (String.compareTo, i.e. lexicographic).
// On equal names the value seen first is kept, because the test is a strict "<".
// NOTE(review): "newer" below presumes segment names sort chronologically — confirm.
// Unwrapped objects that match none of the instanceof checks are silently ignored.
while (values.hasNext()) {
ObjectWritable wrapper = (ObjectWritable)values.next();
Object o = wrapper.get();
if (o instanceof CrawlDatum) {
CrawlDatum val = (CrawlDatum)o;
// check which output dir it belongs to
// both the segment part and the segment name are read from the datum's
// metadata and are mandatory: a missing entry aborts with an IOException
UTF8 part = (UTF8)val.getMetaData().get(SEGMENT_PART_KEY);
if (part == null)
throw new IOException("Null segment part, key=" + key);
UTF8 uName = (UTF8)val.getMetaData().get(SEGMENT_NAME_KEY);
if (uName == null)
throw new IOException("Null segment name, key=" + key);
String name = uName.toString();
String partString = part.toString();
if (partString.equals(CrawlDatum.GENERATE_DIR_NAME)) {
// generate-dir datum: the first one seen wins until a greater segment name shows up
if (lastG == null) {
lastG = val;
lastGname = name;
} else {
// take newer
if (lastGname.compareTo(name) < 0) {
lastG = val;
lastGname = name;
}
}
} else if (partString.equals(CrawlDatum.FETCH_DIR_NAME)) {
// fetch-dir datum: same selection rule as above
if (lastF == null) {
lastF = val;
lastFname = name;
} else {
// take newer
if (lastFname.compareTo(name) < 0) {
lastF = val;
lastFname = name;
}
}
} else if (partString.equals(CrawlDatum.PARSE_DIR_NAME)) {
// parse-dir datum: signature records are tracked on their own ...
if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
if (lastSig == null) {
lastSig = val;
lastSigname = name;
} else {
// take newer
if (lastSigname.compareTo(name) < 0) {
lastSig = val;
lastSigname = name;
}
}
// a signature record must not also be added to the linked lists below
continue;
}
// ... and every other parse-dir datum is appended to the list kept for its
// segment name in `linked`. The original note here read "collect all LINKED
// values from the latest segment"; picking a segment out of `linked` does not
// happen in this loop — presumably it is done by the code that reads `linked`
// later (TODO confirm).
ArrayList segLinked = (ArrayList)linked.get(name);
if (segLinked == null) {
segLinked = new ArrayList();
linked.put(name, segLinked);
}
segLinked.add(val);
} else {
throw new IOException("Cannot determine segment part: " + partString);
}
} else if (o instanceof Content) {
// the segment name is looked up in the content metadata under SEGMENT_NAME_KEY
// NOTE(review): unlike the CrawlDatum branch, name is not null-checked here; if the
// lookup can return null, compareTo(name) below would throw a NullPointerException — verify.
String name = ((Content)o).getMetadata().get(SEGMENT_NAME_KEY.toString());
if (lastC == null) {
lastC = (Content)o;
lastCname = name;
} else {
if (lastCname.compareTo(name) < 0) {
lastC = (Content)o;
lastCname = name;
}
}
} else if (o instanceof ParseData) {
// the segment name is looked up in the parse metadata under SEGMENT_NAME_KEY
// NOTE(review): same missing null check on name as in the Content branch — verify.
String name = ((ParseData)o).getParseMeta().get(SEGMENT_NAME_KEY.toString());
if (lastPD == null) {
lastPD = (ParseData)o;
lastPDname = name;
} else {
if (lastPDname.compareTo(name) < 0) {
lastPD = (ParseData)o;
lastPDname = name;
}
}
} else if (o instanceof ParseText) {
// the segment name is embedded in the text itself: it is the substring that
// starts at offset nameMarker.length() and ends at the first nameMarker found
// at or after that offset. Presumably the text begins with nameMarker, so the
// layout is <nameMarker><name><nameMarker><text> — TODO confirm against the writer.
String text = ((ParseText)o).getText();
String name = null;
int idx = text.indexOf(nameMarker, nameMarker.length());
if (idx != -1) {
name = text.substring(nameMarker.length(), idx);
} else {
throw new IOException("Missing segment name marker in ParseText, key " + key + ": " + text);
}
if (lastPT == null) {
lastPT = (ParseText)o;
lastPTname = name;
} else {
if (lastPTname.compareTo(name) < 0) {
lastPT = (ParseText)o;
lastPTname = name;
}
}
}
}
// Count this key, then work out which slice (if any) its records are tagged with.
curCount++;
UTF8 sliceName = null;
// a single wrapper instance is reused for every output.collect() call below
ObjectWritable wrapper = new ObjectWritable();
// slicing is only active for a positive sliceSize; the slice name is the decimal
// string of curCount / sliceSize (integer division if both are integral — their
// declarations are outside this span, TODO confirm)
if (sliceSize > 0) {
sliceName = new UTF8(String.valueOf(curCount / sliceSize));
}
// now output the latest values
// Each selected value is emitted under the same key, in the fixed order
// generate, fetch, signature, content, parse data, parse text. When slicing is
// active the slice name is attached to the value before it is emitted.
if (lastG != null) {
if (sliceName != null) {
// CrawlDatum values carry the slice in their metadata under SEGMENT_SLICE_KEY
lastG.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
}
wrapper.set(lastG);
output.collect(key, wrapper);
}
if (lastF != null) {
if (sliceName != null) {
lastF.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
}
wrapper.set(lastF);
output.collect(key, wrapper);
}
if (lastSig != null) {
if (sliceName != null) {
lastSig.getMetaData().put(SEGMENT_SLICE_KEY, sliceName);
}
wrapper.set(lastSig);
output.collect(key, wrapper);
}
if (lastC != null) {
if (sliceName != null) {
// Content carries the slice in its metadata under the sliceMarker name
lastC.getMetadata().set(sliceMarker, sliceName.toString());
}
wrapper.set(lastC);
output.collect(key, wrapper);
}
if (lastPD != null) {
if (sliceName != null) {
// ParseData carries the slice in its parse metadata under the sliceMarker name
lastPD.getParseMeta().set(sliceMarker, sliceName.toString());
}
wrapper.set(lastPD);
output.collect(key, wrapper);
}
if (lastPT != null) {
if (sliceName != null) {
// ParseText is replaced by a new instance whose text is prefixed with
// <sliceMarker><sliceName><sliceMarker>; the original text, including any
// segment-name prefix it already had, follows unchanged
lastPT = new ParseText(sliceMarker + sliceName + sliceMarker
+ lastPT.getText());
}
wrapper.set(lastPT);
output.collect(key, wrapper);
}
if (linked.size() > 0) {