/**
 * Reduces the subgram frequencies collected for a single n-gram into a
 * log-likelihood-ratio (LLR) score and emits (ngram string, LLR score).
 *
 * Expected values per key: exactly one HEAD gram and one TAIL gram carrying
 * the marginal frequencies of the n-gram's head and tail components.
 * Unigrams bypass LLR scoring entirely and are emitted with their raw
 * frequency when emitUnigrams is enabled.
 *
 * N-grams with duplicate or missing subgrams, or whose LLR falls below
 * minLLRValue, are dropped; each drop is recorded via a Skipped counter.
 */
protected void reduce(Gram ngram, Iterable<Gram> values, Context context) throws IOException, InterruptedException {
// Marginal frequencies: index 0 = HEAD, index 1 = TAIL; -1 means "not yet seen".
int[] gramFreq = {-1, -1};
// Unigram shortcut: emit (word, frequency) directly, no LLR computation.
if (ngram.getType() == Gram.Type.UNIGRAM && emitUnigrams) {
DoubleWritable dd = new DoubleWritable(ngram.getFrequency());
Text t = new Text(ngram.getString());
context.write(t, dd);
return;
}
// TODO better way to handle errors? Wouldn't an exception thrown here
// cause hadoop to re-try the job?
// NOTE(review): gram[] is assigned below but never read within this method —
// looks like dead code; confirm before removing.
String[] gram = new String[2];
for (Gram value : values) {
// HEAD goes to slot 0; anything else (i.e. TAIL) to slot 1.
int pos = value.getType() == Gram.Type.HEAD ? 0 : 1;
// A second gram of the same type is malformed input: count it and skip
// this n-gram entirely.
if (gramFreq[pos] != -1) {
log.warn("Extra {} for {}, skipping", value.getType(), ngram);
if (value.getType() == Gram.Type.HEAD) {
context.getCounter(Skipped.EXTRA_HEAD).increment(1);
} else {
context.getCounter(Skipped.EXTRA_TAIL).increment(1);
}
return;
}
gram[pos] = value.getString();
gramFreq[pos] = value.getFrequency();
}
// Both subgrams are required to build the contingency table below.
if (gramFreq[0] == -1) {
log.warn("Missing head for {}, skipping.", ngram);
context.getCounter(Skipped.MISSING_HEAD).increment(1);
return;
} else if (gramFreq[1] == -1) {
log.warn("Missing tail for {}, skipping", ngram);
context.getCounter(Skipped.MISSING_TAIL).increment(1);
return;
}
// 2x2 contingency table for the LLR test, where a = head event, b = tail event:
int k11 = ngram.getFrequency(); /* a&b */
int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
int k21 = gramFreq[1] - ngram.getFrequency(); /* !a&b */
// NOTE(review): the (int) cast can overflow if ngramTotal minus the union
// exceeds Integer.MAX_VALUE on very large corpora — confirm expected scale.
int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
try {
double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
// Threshold filter: low-LLR collocations are counted and discarded.
if (llr < minLLRValue) {
context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
return;
}
DoubleWritable dd = new DoubleWritable(llr);
Text t = new Text(ngram.getString());
context.write(t, dd);
// Presumably thrown by logLikelihoodRatio on invalid cell counts
// (e.g. a negative k12/k21 when a subgram frequency is inconsistent) — verify.
} catch (IllegalArgumentException ex) {
context.getCounter(Skipped.LLR_CALCULATION_ERROR).increment(1);
// NOTE(review): the stack trace is discarded here; prefer
// log.error("Problem calculating LLR ratio for {}", ngram, ex).
log.error("Problem calculating LLR ratio: " + ex.getMessage());