// Frequencies of the head and tail subgrams; -1 marks a value not yet seen.
int[] gramFreq = new int[2];
gramFreq[0] = gramFreq[1] = -1;
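// Unigrams have no head/tail decomposition; emit the raw frequency directly
// when unigram output is requested.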
if (ngram.getType() == Gram.Type.UNIGRAM && emitUnigrams) {
  DoubleWritable dd = new DoubleWritable(ngram.getFrequency());
  Text t = new Text(ngram.getString());
  output.collect(t, dd);
  return;
}
// FIXME: better way to handle errors? Wouldn't an exception thrown here
// cause Hadoop to re-try the task?
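// Collect the head and tail subgram frequencies emitted for this ngram;
// each type should appear exactly once.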
String[] gram = new String[2];
while (values.hasNext()) {
  Gram value = values.next();
  int pos = value.getType() == Gram.Type.HEAD ? 0 : 1;
  if (gramFreq[pos] != -1) {
    log.warn("Extra {} for {}, skipping", value.getType(), ngram);
    if (value.getType() == Gram.Type.HEAD) {
      reporter.incrCounter(Skipped.EXTRA_HEAD, 1);
    } else {
      reporter.incrCounter(Skipped.EXTRA_TAIL, 1);
    }
    return;
  }
  gram[pos] = value.getString();
  gramFreq[pos] = value.getFrequency();
}
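// A bigram needs both subgram counts to build the contingency table;
// skip it if either is missing.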
if (gramFreq[0] == -1) {
  log.warn("Missing head for {}, skipping", ngram);
  reporter.incrCounter(Skipped.MISSING_HEAD, 1);
  return;
} else if (gramFreq[1] == -1) {
  log.warn("Missing tail for {}, skipping", ngram);
  reporter.incrCounter(Skipped.MISSING_TAIL, 1);
  return;
}
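// Build the 2x2 contingency table for the bigram, where a is the head and
// b is the tail; k22 (neither a nor b) is derived from the corpus total.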
int k11 = ngram.getFrequency(); /* a&b */
int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
int k21 = gramFreq[1] - ngram.getFrequency(); /* !a&b */
int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
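// logLikelihoodRatio may throw IllegalArgumentException (e.g. on negative
// cell counts from inconsistent input); that case is counted below rather
// than failing the task.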
try {
  double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
  if (llr < minLLRValue) {
    reporter.incrCounter(Skipped.LESS_THAN_MIN_LLR, 1);
    return;
  }
  DoubleWritable dd = new DoubleWritable(llr);
  Text t = new Text(ngram.getString());
  output.collect(t, dd);
} catch (IllegalArgumentException ex) {
  reporter.incrCounter(Skipped.LLR_CALCULATION_ERROR, 1);
  log.error("Problem calculating LLR: {}", ex.getMessage());
}