Counter dupCounter = context.getCounter("app", "duplicate report");
int edgeCount = 0, dups = 0, numReports = 0;
for(Text rep_text: values) {
Report r = Report.createFromString(rep_text.toString());
numReports++;
if(numReports < MAX_IN_MEMORY_REPORTS) {
if(reports.containsKey(r.getMetadata().getOpIdString()))
dups++;
reports.put(r.getMetadata().getOpIdString(), r);
} else if(numReports == MAX_IN_MEMORY_REPORTS) {
//bail out, prepare to do an external sort.
return;
} else
;
// do the external sort
}
HashMap<String, Integer> counts = new HashMap<String, Integer>();
Queue<Report> zeroInlinkReports = new LinkedList<Report>();
reportCounter.increment(reports.size());
dupCounter.increment(dups);
//FIXME: could usefully compare reports.size() with numReports;
//that would measure duplicate reports
//increment link counts for children
for(Report r: reports.values()){
String myOpID = r.getMetadata().getOpIdString();
int parentCount = 0;
for(String inLink: r.get("Edge")) {
//sanitize data from old, nonconformant C++ implementation
if(inLink.contains(","))
inLink = inLink.substring(0, inLink.indexOf(','));
Report parent = reports.get(inLink);
if(parent != null) {
parent.put(OUTLINK_FIELD, myOpID);
parentCount++;
edgeCount++;
}
else { //no match
if(!inLink.equals("0000000000000000")) {
log.info("no sign of parent: " + inLink);
badEdgeCounter.increment(1);
}
//else quietly suppress
}
}
//if no parent was found among the in-memory reports, this report has no
//unresolved inlinks, so queue it as a starting point for the sort below
if(parentCount == 0)
zeroInlinkReports.add(r);
else
counts.put(myOpID, parentCount);
}
log.debug(taskIDString+": " + edgeCount + " total edges");
edgeCounter.increment(edgeCount);
//at this point, we have a map from op ID string to report (reports), and
//also a map from op ID to inlink count (counts) for every report that has
//at least one parent present in reports.
//next step is to do a topological sort.
Text[] finalOutput = new Text[reports.size()];
log.debug(taskIDString+": expecting to sort " + finalOutput.length + " reports");
int i=0;
while(!zeroInlinkReports.isEmpty()) {
Report r = zeroInlinkReports.poll();
if(r == null) {
log.warn(taskIDString+": poll returned null but list not empty. This is probably a bug"
+ " fairly deep down");
break;
}
finalOutput[i++] = new Text(r.toString());
List<String> outLinks = r.get(OUTLINK_FIELD);
if(outLinks != null) {
for(String outLink: outLinks) {
Integer oldCount = counts.get(outLink);
if(oldCount == null) {
oldCount = 0; //FIXME: can this happen?