Package ch.akuhn.hapax.index

Source Code of ch.akuhn.hapax.index.LogLikelihood$Comparison

package ch.akuhn.hapax.index;

import java.util.ArrayList;
import java.util.Collections;

import org.junit.Test;

import ch.akuhn.hapax.corpus.Terms;

public class LogLikelihood implements Comparable<LogLikelihood> {

  private Terms t1, t2;
  private String term;
  private double logL;
  private boolean p1HigherThenP2;

  public LogLikelihood(Terms t1, Terms t2, String term) {
    this.t1 = t1;
    this.t2 = t2;
    this.logL = getDunning(term);
    this.term = term;
    this.p1HigherThenP2 = isP1HigherThenP2(term);
  }

  private boolean isP1HigherThenP2(String term2) {
    double k1 = t1.occurrences(term);
    double k2 = t2.occurrences(term);
    double n1 = t1.size();
    double n2 = t2.size();
    return (k1 / n1) > (k2 / n2);
  }

  public double value() {
    return p1HigherThenP2 ? logL : -logL;
  }

  /**
   *
   * @see Ted Dunning, 1993
   *
   */
  private double getDunning(String t) {
    double k1 = t1.occurrences(t);
    double k2 = t2.occurrences(t);
    double n1 = t1.size();
    double n2 = t2.size();
    double p1 = k1 / n1;
    double p2 = k2 / n2;
    double p = (k1 + k2) / (n1 + n2);
    return 2 * (logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2));
  }

  private double logL(double p, double k, double n) {
    return (k == 0 ? 0 : k * Math.log(p)) + ((n - k) == 0 ? 0 : (n - k) * Math.log(1 - p));
  }

  // @Override
  public int compareTo(LogLikelihood other) {
    return (int) (other.value() - this.value());
  }

  @Override
  public String toString() {
    return String.format("logL(%s) = %.3f", term, value());
  }

  public static Comparison compare(Terms t1, Terms t2) {
    Comparison comparison = new Comparison();
    for (String each: new Terms(t1, t2).elementSet()) {
      comparison.add(new LogLikelihood(t1, t2, each));
    }
    Collections.sort(comparison);
    return comparison;
  }

  @SuppressWarnings("serial")
  public static class Comparison extends ArrayList<LogLikelihood> {

    public Comparison withThreshold(int logL) {
      Comparison selection = new Comparison();
      for (LogLikelihood each: this) {
        if (each.isAboveThreshold(logL)) selection.add(each);
      }
      return selection;
    }

  }

  public boolean isAboveThreshold(int threshold) {
    return threshold >= 0 ? this.value() >= threshold : this.value() <= threshold;
  }

  public static class Examples {

    @Test
    public void shouldComputeLogLikelihood() {
      Terms all = new Terms("A A A A A B C C C D D");
      Terms doc = new Terms("A A B B X");
      for (String each: doc.elements()) {
        LogLikelihood loglr = new LogLikelihood(all, doc, each);
        System.out.println(loglr);
      }
    }

  }

}
TOP

Related Classes of ch.akuhn.hapax.index.LogLikelihood$Comparison

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.