Package joshua.decoder.ff.tm.hiero

Source Code of joshua.decoder.ff.tm.hiero.MemoryBasedBatchGrammar

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.tm.hiero;

import joshua.decoder.ff.tm.BatchGrammar;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.GrammarReader;
import joshua.decoder.ff.tm.Trie;
import joshua.corpus.vocab.SymbolTable;

import java.io.IOException;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* This class implements a memory-based bilingual BatchGrammar.
* <p>
* The rules are stored in a trie keyed on the source ("french") side of
* each rule. Each trie node has:
* (1) a RuleBin: the list of rules whose source side matches the path so far
* (2) a HashMap of next-layer trie nodes, keyed by the next source-side symbol
*
* @author Zhifei Li, <zhifei.work@gmail.com>
* @version $LastChangedDate: 2010-01-08 11:00:40 -0600 (Fri, 08 Jan 2010) $
*/
public class MemoryBasedBatchGrammar extends BatchGrammar {
 
//===============================================================
// Instance Fields
//===============================================================
 
  static private double temEstcost = 0.0;
 
  private int qtyRulesRead = 0;
  private int qtyRuleBins  = 0;
  private MemoryBasedTrie root = null;
 
  //protected ArrayList<FeatureFunction> featureFunctions = null;
  private int defaultOwner;
 
  private float oovFeatureCost = 100;
 
  /**
   * the OOV rule should have this lhs, this should be grammar
   * specific as only the grammar knows what LHS symbol can
   * be combined with other rules
   */
  private int defaultLHS;
 
 
  private int spanLimit = 10;
  private final SymbolTable symbolTable;

  private GrammarReader<BilingualRule> modelReader;
 
//===============================================================
// Static Fields
//===============================================================

  public static final int OOV_RULE_ID = 0;

  /* Three kinds of rules:
   *     regular rule (id>0)
   *     oov rule (id=0)
   *     null rule (id=-1)
   */
 
  static int ruleIDCount = 1;
   
  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(MemoryBasedBatchGrammar.class.getName());

//===============================================================
// Constructors
//===============================================================

  public MemoryBasedBatchGrammar() {
    symbolTable = null;
  }
 
  public MemoryBasedBatchGrammar(
      String formatKeyword,
      String grammarFile,
      SymbolTable symbolTable,
      String defaultOwner,
      String defaultLHSSymbol,
      int spanLimit,
      float oovFeatureCost_) throws IOException
  {
   
    this.symbolTable  = symbolTable;
    this.defaultOwner = this.symbolTable.addTerminal(defaultOwner);
    this.defaultLHS   = this.symbolTable.addNonterminal(defaultLHSSymbol);
    this.spanLimit    = spanLimit;
    this.oovFeatureCost = oovFeatureCost_;
    this.root = new MemoryBasedTrie();
   
    //==== loading grammar
    this.modelReader = createReader(formatKeyword, grammarFile, symbolTable);
    if (modelReader != null) {
      modelReader.initialize();
      for (BilingualRule rule : modelReader)
        if (rule != null)
          addRule(rule);
    } else {
      if (logger.isLoggable(Level.WARNING))
        logger.warning("Couldn't create a GrammarReader for file " + grammarFile + " with format " + formatKeyword);
    }

    this.printGrammar();
  }
 
  protected GrammarReader<BilingualRule> createReader(String formatKeyword,
      String grammarFile, SymbolTable symbolTable){
   
    if ("hiero".equals(formatKeyword)) {
      return new HieroFormatReader(grammarFile, symbolTable);
    } else if ("samt".equals(formatKeyword)) {
      return new SamtFormatReader(grammarFile, symbolTable);
    } else {
      // TODO: throw something?
      // TODO: add special warning if "heiro" mispelling is used
     
      if (logger.isLoggable(Level.WARNING))
        logger.warning("Unknown GrammarReader format " + formatKeyword);
     
      return null;
    }
  }
 
 
//===============================================================
// Methods
//===============================================================

  public int getNumRules() {
    return this.qtyRulesRead;
  }


  public Rule constructOOVRule(int qtyFeatures, int sourceWord, int targetWord, boolean hasLM) {
    int[] french      = new int[1];
    french[0]         = sourceWord;
    int[] english       = new int[1];
    english[0]          = targetWord;
    float[] feat_scores = new float[qtyFeatures];
   
    // TODO: This is a hack to make the decoding without a LM works
    /**when a ngram LM is used, the OOV word will have a cost 100.
     * if no LM is used for decoding, so we should set the cost of some
     * TM feature to be maximum
     * */
    if ( (!hasLM) && qtyFeatures > 0) {
      feat_scores[0] = oovFeatureCost;
    }
   
    return new BilingualRule(this.defaultLHS, french, english, feat_scores, 0, this.defaultOwner, 0, getOOVRuleID());
  }

  public int getOOVRuleID() {
    return OOV_RULE_ID;
  }
 
 
  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity) {
    return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.defaultOwner, 0, getOOVRuleID());
  }
 
 
 
 
  /**
   * if the span covered by the chart bin is greater than the
   * limit, then return false
   */
  public boolean hasRuleForSpan(int startIndex,  int endIndex,  int pathLength) {
    if (this.spanLimit == -1) { // mono-glue grammar
      return (startIndex == 0);
    } else {
      return (endIndex - startIndex <= this.spanLimit);
    }
  }
 
  public Trie getTrieRoot() {
    return this.root;
  }

  protected void addRule(BilingualRule rule) {
   
    // TODO: Why two increments?
    this.qtyRulesRead++;
    ruleIDCount++;

    rule.setRuleID(ruleIDCount);
    rule.setOwner(defaultOwner);
   
    // TODO: make sure costs are calculated here or in reader
    temEstcost += rule.getEstCost();
   
    //=== identify the position, and insert the trie nodes as necessary
    MemoryBasedTrie pos = root;
    int[] french = rule.getFrench();
    for (int k = 0; k < french.length; k++) {
      int curSymID = french[k];
     
      /**Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth
       * like [X,1]), but the symbol in the Trie has to be cleaned, so that the match does
       * not care about the markup (i.e., [X,1] or [X,2] means the same thing, that is X)*/
      if (this.symbolTable.isNonterminal(french[k])) {
        curSymID = modelReader.cleanNonTerminal(french[k]);
      }
     
      MemoryBasedTrie nextLayer = pos.matchOne(curSymID);
      if (null == nextLayer) {
        nextLayer = new MemoryBasedTrie();
        if (pos.hasExtensions() == false) {
          pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
        }
        pos.childrenTbl.put(curSymID, nextLayer);
      }
      pos = nextLayer;
    }
   
   
    //=== add the rule into the trie node
    if (! pos.hasRules()) {
      pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
      this.qtyRuleBins++;
    }
    pos.ruleBin.addRule(rule);
  }
 

 
  // BUG: This always prints 0 for all fields
  protected void printGrammar() {
    if (logger.isLoggable(Level.INFO)) {
      logger.info("###########Grammar###########");
      logger.info(String.format(
        "####num_rules: %d; num_bins: %d; num_pruned: %d; sumest_cost: %.5f",
        this.qtyRulesRead, this.qtyRuleBins, 0, temEstcost));
    }
    /*if(root!=null)
      root.print_info(Support.DEBUG);*/
  }

 
}
 
TOP

Related Classes of joshua.decoder.ff.tm.hiero.MemoryBasedBatchGrammar

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.