package ch.akuhn.hapax.index;
import java.util.ArrayList;
import java.util.Collections;
import org.junit.Test;
import ch.akuhn.hapax.corpus.Terms;
/**
 * Signed log-likelihood ratio of a single term between two {@link Terms}
 * collections.
 * <p>
 * The magnitude is computed by {@link #getDunning(String)}; the sign reported
 * by {@link #value()} is positive when the relative frequency of the term
 * ({@code occurrences(term) / size()}) is strictly higher in the first
 * collection than in the second, and negative otherwise.
 * <p>
 * The natural ordering sorts instances by {@link #value()} in descending
 * order. Note that {@code equals} is not overridden, so the ordering is not
 * consistent with {@code equals}.
 *
 * @see "Ted Dunning, 1993"
 */
public class LogLikelihood implements Comparable<LogLikelihood> {

    /** The two collections being compared; never reassigned. */
    private final Terms t1, t2;

    /** The term this score was computed for. */
    private final String term;

    /** Unsigned log-likelihood ratio as returned by {@link #getDunning(String)}. */
    private final double logL;

    /** Whether the term's relative frequency in {@code t1} exceeds that in {@code t2}. */
    private final boolean p1HigherThenP2;

    /**
     * Computes the log-likelihood ratio of {@code term} between {@code t1}
     * and {@code t2}.
     *
     * @param t1   first collection (a positive {@link #value()} means the
     *             term is relatively more frequent here)
     * @param t2   second collection
     * @param term the term to score
     */
    public LogLikelihood(Terms t1, Terms t2, String term) {
        this.t1 = t1;
        this.t2 = t2;
        this.term = term;
        this.logL = getDunning(term);
        this.p1HigherThenP2 = isP1HigherThenP2(term);
    }

    /**
     * Tells whether the relative frequency of {@code term} in {@code t1} is
     * strictly greater than in {@code t2}.
     * <p>
     * If either collection reports a size of zero the division yields NaN or
     * an infinity; a NaN comparison evaluates to {@code false}.
     *
     * @param term the term to look up in both collections
     * @return {@code true} if {@code k1 / n1 > k2 / n2}
     */
    private boolean isP1HigherThenP2(String term) {
        double k1 = t1.occurrences(term);
        double k2 = t2.occurrences(term);
        double n1 = t1.size();
        double n2 = t2.size();
        return (k1 / n1) > (k2 / n2);
    }

    /**
     * Returns the signed score.
     *
     * @return the log-likelihood ratio, negated when the term is not
     *         relatively more frequent in the first collection
     */
    public double value() {
        return p1HigherThenP2 ? logL : -logL;
    }

    /**
     * Computes the unsigned log-likelihood ratio
     * {@code 2 * (logL(p1,k1,n1) + logL(p2,k2,n2) - logL(p,k1,n1) - logL(p,k2,n2))}
     * where {@code k} is {@code occurrences(t)}, {@code n} is {@code size()},
     * {@code p1 = k1/n1}, {@code p2 = k2/n2} and {@code p} is the pooled
     * proportion {@code (k1+k2)/(n1+n2)}.
     *
     * @param t the term to score
     * @return the log-likelihood ratio of {@code t}
     * @see "Ted Dunning, 1993"
     */
    private double getDunning(String t) {
        double k1 = t1.occurrences(t);
        double k2 = t2.occurrences(t);
        double n1 = t1.size();
        double n2 = t2.size();
        double p1 = k1 / n1;
        double p2 = k2 / n2;
        double p = (k1 + k2) / (n1 + n2);
        return 2 * (logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2));
    }

    /**
     * Computes {@code k * ln(p) + (n - k) * ln(1 - p)}.
     * <p>
     * A summand whose coefficient is zero is skipped entirely, so that
     * {@code 0 * ln(0)} (which would evaluate to NaN) contributes zero.
     *
     * @param p probability of a hit
     * @param k number of hits
     * @param n number of trials
     * @return the sum described above
     */
    private double logL(double p, double k, double n) {
        double misses = n - k;
        double hitPart = (k == 0) ? 0 : k * Math.log(p);
        double missPart = (misses == 0) ? 0 : misses * Math.log(1 - p);
        return hitPart + missPart;
    }

    /**
     * Orders instances by {@link #value()}, highest first.
     * <p>
     * Uses {@link Double#compare(double, double)} rather than casting the
     * difference to {@code int}: the cast truncated every difference smaller
     * than one to "equal", which made the ordering non-transitive and left
     * near-equal scores unsorted.
     *
     * @param other the instance to compare against
     * @return a negative number, zero, or a positive number as this value is
     *         greater than, equal to, or less than {@code other}'s value
     */
    // No @Override here: the original kept it commented out, presumably for
    // compilers that reject the annotation on interface methods.
    public int compareTo(LogLikelihood other) {
        return Double.compare(other.value(), this.value());
    }

    /**
     * Renders the term and its signed score with three decimals.
     *
     * @return a string of the form {@code logL(term) = value}
     */
    @Override
    public String toString() {
        return String.format("logL(%s) = %.3f", term, value());
    }

    /**
     * Scores every element of {@code new Terms(t1, t2).elementSet()} and
     * returns the scores sorted by natural ordering (highest value first).
     *
     * @param t1 first collection
     * @param t2 second collection
     * @return the sorted scores
     */
    public static Comparison compare(Terms t1, Terms t2) {
        Comparison comparison = new Comparison();
        // presumably the combined Terms holds the distinct terms of both inputs
        for (String each : new Terms(t1, t2).elementSet()) {
            comparison.add(new LogLikelihood(t1, t2, each));
        }
        Collections.sort(comparison);
        return comparison;
    }

    /** A list of scores that can be filtered by threshold. */
    @SuppressWarnings("serial")
    public static class Comparison extends ArrayList<LogLikelihood> {

        /**
         * Returns a new list holding the entries that satisfy
         * {@link LogLikelihood#isAboveThreshold(int)}, in their current order.
         *
         * @param logL the threshold; see {@link LogLikelihood#isAboveThreshold(int)}
         * @return the selected entries; this list is not modified
         */
        public Comparison withThreshold(int logL) {
            Comparison selection = new Comparison();
            for (LogLikelihood each : this) {
                if (each.isAboveThreshold(logL)) {
                    selection.add(each);
                }
            }
            return selection;
        }
    }

    /**
     * Tests the signed score against a threshold whose sign selects the
     * direction of the test.
     *
     * @param threshold if non-negative, the score must be at least
     *                  {@code threshold}; if negative, the score must be at
     *                  most {@code threshold}
     * @return whether the score passes
     */
    public boolean isAboveThreshold(int threshold) {
        return threshold >= 0 ? this.value() >= threshold : this.value() <= threshold;
    }

    /** Example usage, runnable as a JUnit test. */
    public static class Examples {

        /** Prints the score of each element of a small document against a larger corpus. */
        @Test
        public void shouldComputeLogLikelihood() {
            Terms all = new Terms("A A A A A B C C C D D");
            Terms doc = new Terms("A A B B X");
            for (String each : doc.elements()) {
                LogLikelihood loglr = new LogLikelihood(all, doc, each);
                System.out.println(loglr);
            }
        }
    }
}