Package joshua.decoder.ff.lm

Source Code of joshua.decoder.ff.lm.ArpaFileTest

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.lm;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;

import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.lm.buildin_lm.TrieLM;

import org.testng.Assert;
import org.testng.annotations.Test;

/**
* Unit tests for testing ARPA language model class.
*
* @author Lane Schwartz
*/
public class ArpaFileTest {

  String arpaFileName;
  SymbolTable vocab;
 
  @Test
  public void setup() {
   
    vocab = new Vocabulary();
    vocab.addTerminal("a");
    vocab.addTerminal("because");
    vocab.addTerminal("boycott");
    vocab.addTerminal("of");
    vocab.addTerminal("parliament");
    vocab.addTerminal("potato");
    vocab.addTerminal("resumption");
    vocab.addTerminal("the");
   
    try {
      File file = File.createTempFile("testLM", "arpa");
      PrintStream out = new PrintStream(file, "UTF-8");
     
      out.println();
      out.println("\\data\\");
      out.println("ngram 1=8");
      out.println("ngram 2=4");
      out.println("ngram 3=1");
      out.println();
     
      out.println("\\1-grams:");
      out.println("-1.992672       a       -0.1195484");
      out.println("-2.713723       because -0.4665429");
      out.println("-4.678545       boycott -0.0902521");
      out.println("-1.609573       of      -0.1991907");
      out.println("-3.875917       parliament      -0.1274891");
      out.println("-9.753210       potato");
      out.println("-4.678545       resumption      -0.07945678");
      out.println("-1.712444       the     -0.1606644");
     
      out.println();
      out.println("\\2-grams:");
      out.println("-0.3552987      because of      -0.03083654");
      out.println("-1.403534       of a");
      out.println("-0.7507797      of the  -0.05237135");
      out.println("-0.7266324      resumption of");
      out.println("-3.936147       the resumption");
     
      out.println();
      out.println("\\3-grams:");
      out.println("-0.6309999      because of the");
      out.println();
     
      out.println("\\end\\");
     
      out.close();
      this.arpaFileName = file.getAbsolutePath();
     
    } catch (IOException e) {
      Assert.fail("Unable to create temporary file: " + e.toString());
    }
   
  }
 
  @Test(dependsOnMethods={"setup"})
  public void testOrder() {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
   
    try {
      Assert.assertEquals(arpaFile.getOrder(), 3);
    } catch (FileNotFoundException e) {
      Assert.fail(e.toString());
    }
  }
 
  @Test(dependsOnMethods={"setup"})
  public void testIteration() {
   
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
   
    Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
   
    boolean iterationOccurred = false;
   
    for (ArpaNgram ngram : arpaFile) {
   
      iterationOccurred = true;
     
      int order = ngram.order();
//      System.err.println("Order = " + order);
     
      int count;
      if (counts.containsKey(order)) {
        count = counts.get(order) + 1;
      } else {
        count = 1;
      }
     
      counts.put(order, count);

    }
   
    Assert.assertTrue(iterationOccurred);

    Assert.assertTrue(counts.containsKey(1));
    Assert.assertTrue(counts.containsKey(2));
    Assert.assertTrue(counts.containsKey(3));
   
    Assert.assertEquals((int) counts.get(1), 8);
    Assert.assertEquals((int) counts.get(2), 5);
    Assert.assertEquals((int) counts.get(3), 1);
   
  }
 
  @Test(dependsOnMethods={"setup"})
  public void testSize() {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
   
    Assert.assertEquals(arpaFile.size(), 14);
  }
 
  @Test(dependsOnMethods={"setup","testIteration"})
  public void testChildren() throws FileNotFoundException {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
   
    TrieLM lm = new TrieLM(arpaFile);
//    System.err.println(lm.getChildren().size());
    Assert.assertNotSame(lm.getChildren().size(), 0);
  }
 
  @Test(dependsOnMethods={"setup","testIteration","testChildren"})
  public void testTrie() throws FileNotFoundException {
    ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
   
    TrieLM lm = new TrieLM(arpaFile);

    // Test unigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a")),-1.992672, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because")),-2.713723, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("boycott")),-4.678545, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of")),-1.609573, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("parliament")),-3.875917, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato")),-9.753210, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption")),-4.678545, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the")),-1.712444, 0.000001f);
   
    // Test unigrams known to NOT be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("banana")), -JoshuaConfiguration.lm_ceiling_cost, 0.000001f);
   
   
    // Test bigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of")), -0.3552987, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the")), -0.7507797, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption of")), -0.7266324, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the resumption")), -3.936147, 0.000001f);
   
    // Test trigrams known to be in the language model
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of the")), -0.6309999f, 0.000001f);
 
 
    // Test bigrams know to NOT be in the language model (but the unigrams are)
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a boycott")), -4.678545f + -0.1195484f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of parliament")), -3.875917f + -0.1991907f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the potato")), -9.753210f + -0.1606644f, 0.000001f);
    Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato parliament")), -3.875917f + -0.0f, 0.000001f);
   
    // Test trigrams know to NOT be in the language model (but the bigrams are)
    int[] words = vocab.getIDs("because of a");
    double f = lm.ngramLogProbability(words);
    Assert.assertEquals(f, -1.403534f + -0.03083654f, 0.000001f);
//    //Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the parliament")), -3.875917f + -0.05237135f, 0.000001f);
   
  }
}
TOP

Related Classes of joshua.decoder.ff.lm.ArpaFileTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.