/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.lm;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.lm.buildin_lm.TrieLM;
import org.testng.Assert;
import org.testng.annotations.Test;
/**
 * Unit tests for the ARPA language model file reader and the trie language model built from it.
*
* @author Lane Schwartz
*/
public class ArpaFileTest {
String arpaFileName;
SymbolTable vocab;
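// These tests are chained with TestNG's dependsOnMethods: setup() runs first and writes a
// small temporary ARPA file, which the remaining tests then read via this.arpaFileName.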
@Test
public void setup() {
vocab = new Vocabulary();
vocab.addTerminal("a");
vocab.addTerminal("because");
vocab.addTerminal("boycott");
vocab.addTerminal("of");
vocab.addTerminal("parliament");
vocab.addTerminal("potato");
vocab.addTerminal("resumption");
vocab.addTerminal("the");
try {
File file = File.createTempFile("testLM", ".arpa");
file.deleteOnExit(); // remove the temporary ARPA file when the JVM exits
PrintStream out = new PrintStream(file, "UTF-8");
out.println();
out.println("\\data\\");
out.println("ngram 1=8");
out.println("ngram 2=4");
out.println("ngram 3=1");
out.println();
out.println("\\1-grams:");
out.println("-1.992672 a -0.1195484");
out.println("-2.713723 because -0.4665429");
out.println("-4.678545 boycott -0.0902521");
out.println("-1.609573 of -0.1991907");
out.println("-3.875917 parliament -0.1274891");
out.println("-9.753210 potato");
out.println("-4.678545 resumption -0.07945678");
out.println("-1.712444 the -0.1606644");
out.println();
out.println("\\2-grams:");
out.println("-0.3552987 because of -0.03083654");
out.println("-1.403534 of a");
out.println("-0.7507797 of the -0.05237135");
out.println("-0.7266324 resumption of");
out.println("-3.936147 the resumption");
out.println();
out.println("\\3-grams:");
out.println("-0.6309999 because of the");
out.println();
out.println("\\end\\");
out.close();
this.arpaFileName = file.getAbsolutePath();
} catch (IOException e) {
Assert.fail("Unable to create temporary file: " + e.toString());
}
}
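// The model written in setup() is a trigram model, so the reported order should be 3.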
@Test(dependsOnMethods={"setup"})
public void testOrder() {
ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
try {
Assert.assertEquals(arpaFile.getOrder(), 3);
} catch (FileNotFoundException e) {
Assert.fail(e.toString());
}
}
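// Iterating over an ArpaFile should visit each n-gram in the file once; tally how many
// n-grams of each order are seen and compare against the entries written in setup().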
@Test(dependsOnMethods={"setup"})
public void testIteration() {
ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
boolean iterationOccurred = false;
for (ArpaNgram ngram : arpaFile) {
iterationOccurred = true;
int order = ngram.order();
// System.err.println("Order = " + order);
int count;
if (counts.containsKey(order)) {
count = counts.get(order) + 1;
} else {
count = 1;
}
counts.put(order, count);
}
Assert.assertTrue(iterationOccurred);
Assert.assertTrue(counts.containsKey(1));
Assert.assertTrue(counts.containsKey(2));
Assert.assertTrue(counts.containsKey(3));
Assert.assertEquals((int) counts.get(1), 8);
Assert.assertEquals((int) counts.get(2), 5);
Assert.assertEquals((int) counts.get(3), 1);
}
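// size() should report the total number of n-grams in the file: 8 unigrams + 5 bigrams + 1 trigram = 14.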
@Test(dependsOnMethods={"setup"})
public void testSize() {
ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
Assert.assertEquals(arpaFile.size(), 14);
}
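// Building a TrieLM from the ARPA file should yield a trie with at least one child entry.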
@Test(dependsOnMethods={"setup","testIteration"})
public void testChildren() throws FileNotFoundException {
ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
TrieLM lm = new TrieLM(arpaFile);
// System.err.println(lm.getChildren().size());
// The trie built from the ARPA file should not be empty
Assert.assertTrue(lm.getChildren().size() > 0);
}
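// For n-grams stored in the ARPA file, the trie should return the stored log10 probability.
// For n-grams that are absent, a backoff model computes the probability in log space as
//   log P(w | history) = backoff(history) + log P(w | shorter history)
// which is why the expected values below are sums of a backoff weight and a lower-order probability.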
@Test(dependsOnMethods={"setup","testIteration","testChildren"})
public void testTrie() throws FileNotFoundException {
ArpaFile arpaFile = new ArpaFile(arpaFileName, vocab);
TrieLM lm = new TrieLM(arpaFile);
// Test unigrams known to be in the language model
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a")),-1.992672, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because")),-2.713723, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("boycott")),-4.678545, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of")),-1.609573, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("parliament")),-3.875917, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato")),-9.753210, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption")),-4.678545, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the")),-1.712444, 0.000001f);
// Test unigrams known to NOT be in the language model
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("banana")), -JoshuaConfiguration.lm_ceiling_cost, 0.000001f);
// Test bigrams known to be in the language model
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of")), -0.3552987, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the")), -0.7507797, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("resumption of")), -0.7266324, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the resumption")), -3.936147, 0.000001f);
// Test trigrams known to be in the language model
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("because of the")), -0.6309999f, 0.000001f);
// Test bigrams known to NOT be in the language model (but the unigrams are)
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("a boycott")), -4.678545f + -0.1195484f, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of parliament")), -3.875917f + -0.1991907f, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("the potato")), -9.753210f + -0.1606644f, 0.000001f);
Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("potato parliament")), -3.875917f + -0.0f, 0.000001f);
// Test trigrams known to NOT be in the language model (but the bigrams are)
int[] words = vocab.getIDs("because of a");
double f = lm.ngramLogProbability(words);
Assert.assertEquals(f, -1.403534f + -0.03083654f, 0.000001f);
//Assert.assertEquals(lm.ngramLogProbability(vocab.getIDs("of the parliament")), -3.875917f + -0.05237135f, 0.000001f);
}
}