Source Code of joshua.prefix_tree.LMAdaptingRuleExtractor

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.prefix_tree;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;


import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrase;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.lm.NGramLanguageModel;
import joshua.decoder.ff.lm.buildin_lm.LMGrammarJAVA;


/**
 *
 *
 * @author Lane Schwartz
 */
public class LMAdaptingRuleExtractor extends HierarchicalRuleExtractor {


  final float[] weights;
  
  /**
     * Constructs a rule extractor for 
     * Hiero-style hierarchical phrase-based translation.
   * 
   * @param suffixArray        Suffix array representing the 
   *                           source language corpus
   * @param targetCorpus       Corpus array representing the
   *                           target language corpus
   * @param alignments         Represents alignments between words in the 
   *                           source corpus and the target corpus 
   * @param lexProbs           Lexical translation probability table
   * @param sampleSize         Specifies the maximum number of rules 
   *                           that will be extracted for any source pattern
   * @param maxPhraseSpan      Max span in the source corpus of any 
   *                           extracted hierarchical phrase
   * @param maxPhraseLength    Maximum number of terminals plus nonterminals
   *                           allowed in any extracted hierarchical phrase
   * @param minNonterminalSpan Minimum span in the source corpus of any 
   *                           nonterminal in an extracted hierarchical 
   *                           phrase
   * @param maxNonterminalSpan Maximum span in the source corpus of any 
   *                           nonterminal in an extracted hierarchical 
   *                           phrase
   * @throws IOException 
   */
  public LMAdaptingRuleExtractor(
      String largeArpaLM, String testArpaLM, int lmOrder,
      Suffixes suffixArray, 
      Suffixes targetSuffixArray, 
      Alignments alignments, 
      LexicalProbabilities lexProbs, 
      ArrayList<FeatureFunction> models,
      int sampleSize, 
      int maxPhraseSpan, 
      int maxPhraseLength, 
      int minNonterminalSpan, 
      int maxNonterminalSpan) throws IOException {
    
    super(suffixArray, 
        targetSuffixArray, alignments, 
        lexProbs, models, sampleSize, 
        maxPhraseSpan, maxPhraseLength, 
        minNonterminalSpan, maxNonterminalSpan);
    
    SymbolTable vocab = new Vocabulary();
    
    Corpus corpus = suffixArray.getCorpus();
    
    NGramLanguageModel largeLM = new LMGrammarJAVA(
        vocab,
        lmOrder,
        largeArpaLM,
        JoshuaConfiguration.use_left_equivalent_state,
        JoshuaConfiguration.use_right_equivalent_state);
    
    NGramLanguageModel testLM = new LMGrammarJAVA(
        vocab,
        lmOrder,
        testArpaLM,
        JoshuaConfiguration.use_left_equivalent_state,
        JoshuaConfiguration.use_right_equivalent_state);
    
    this.weights = new float[corpus.getNumSentences()];
    
    for (int i=0, n=corpus.getNumSentences(); i<n; i++) {
      Phrase sentence = corpus.getSentence(i);
      int[] words = sentence.getWordIDs();
      double largeProbLM = largeLM.ngramLogProbability(words);
      double testProbLM = testLM.ngramLogProbability(words);
      double ratio = testProbLM - largeProbLM;
      this.weights[i] = (float) ratio;
    }
  }
  
  @Override
  protected float[] calculateFeatureValues(Pattern sourcePattern, int sourcePatternCount, HierarchicalPhrase translation, Map<Pattern,Integer> counts, float totalTranslationCount) {
    float[] featureValues = super.calculateFeatureValues(sourcePattern, sourcePatternCount, translation, counts, totalTranslationCount);
    
    return featureValues;
  }
  
}
Source Code of joshua.prefix_tree.LMAdaptingRuleExtractor

Related Classes of joshua.prefix_tree.LMAdaptingRuleExtractor