/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.tm.hiero;
import joshua.decoder.ff.tm.BatchGrammar;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.GrammarReader;
import joshua.decoder.ff.tm.Trie;
import joshua.corpus.vocab.SymbolTable;
import java.io.IOException;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* This class implements a memory-based bilingual BatchGrammar.
* <p>
* The rules are stored in a trie. Each trie node has:
* (1) RuleBin: a list of rules matching the french sides so far
* (2) A HashMap of next-layer trie nodes, the next french word
* used as the key in HashMap
*
* @author Zhifei Li, <zhifei.work@gmail.com>
* @version $LastChangedDate: 2010-01-08 11:00:40 -0600 (Fri, 08 Jan 2010) $
*/
public class MemoryBasedBatchGrammar extends BatchGrammar {

    //===============================================================
    // Static Fields
    //===============================================================

    /** Rule ID returned by {@link #getOOVRuleID()} and attached to OOV rules. */
    public static final int OOV_RULE_ID = 0;

    /* Three kinds of rules:
     *   regular rule (id > 0)
     *   oov rule     (id = 0)
     *   null rule    (id = -1)
     */

    /**
     * Counter behind the IDs of regular rules. It is static, so it is
     * shared by every grammar instance, and it is incremented
     * <em>before</em> being assigned in {@link #addRule(BilingualRule)};
     * the first rule added therefore receives ID 2.
     * <p>
     * NOTE(review): updates are not synchronized — confirm that grammars
     * are never loaded concurrently.
     */
    static int ruleIDCount = 1;

    /** Logger for this class. */
    private static final Logger logger =
        Logger.getLogger(MemoryBasedBatchGrammar.class.getName());

    //===============================================================
    // Instance Fields
    //===============================================================

    /**
     * Running sum of {@code rule.getEstCost()} over the rules added to
     * <em>this</em> grammar; reported as {@code sumest_cost} by
     * {@link #printGrammar()}. Kept per instance so that the summary of
     * one grammar is not inflated by rules loaded into another.
     */
    private double temEstcost = 0.0;

    /** Number of rules passed to {@link #addRule(BilingualRule)}. */
    private int qtyRulesRead = 0;

    /** Number of rule bins created in the trie by {@link #addRule(BilingualRule)}. */
    private int qtyRuleBins = 0;

    /** Root of the rule trie; stays {@code null} when the no-argument constructor is used. */
    private MemoryBasedTrie root = null;

    /** Owner ID stamped on every rule added to, or constructed by, this grammar. */
    private int defaultOwner;

    /** Value written to feature 0 of an OOV rule when no language model is used. */
    private float oovFeatureCost = 100;

    /**
     * the OOV rule should have this lhs, this should be grammar
     * specific as only the grammar knows what LHS symbol can
     * be combined with other rules
     */
    private int defaultLHS;

    /** Maximum span width accepted by {@link #hasRuleForSpan}; -1 marks a mono-glue grammar. */
    private int spanLimit = 10;

    private final SymbolTable symbolTable;

    /** Reader the rules were loaded from; also used to clean nonterminal markup in {@link #addRule}. */
    private GrammarReader<BilingualRule> modelReader;

    //===============================================================
    // Constructors
    //===============================================================

    /**
     * Creates a grammar without a symbol table, trie root or reader.
     * <p>
     * Note that {@link #addRule(BilingualRule)} dereferences all three,
     * so it cannot be used on an instance built this way, and
     * {@link #getTrieRoot()} returns {@code null}.
     */
    public MemoryBasedBatchGrammar() {
        symbolTable = null;
    }

    /**
     * Loads a grammar file into an in-memory trie and logs a summary.
     * <p>
     * If {@link #createReader} returns {@code null} (unknown format), a
     * warning is logged and the grammar is left empty.
     *
     * @param formatKeyword    grammar file format, passed to {@link #createReader}
     * @param grammarFile      grammar file to read
     * @param symbolTable      symbol table used to register the owner and LHS symbols
     * @param defaultOwner     owner name; registered as a terminal
     * @param defaultLHSSymbol LHS symbol for OOV rules; registered as a nonterminal
     * @param spanLimit        maximum span width, or -1 for a mono-glue grammar
     * @param oovFeatureCost_  cost assigned to OOV rules when no LM is used
     * @throws IOException declared for callers; NOTE(review): presumably raised
     *         while reading the grammar file — confirm against the reader
     */
    public MemoryBasedBatchGrammar(
        String formatKeyword,
        String grammarFile,
        SymbolTable symbolTable,
        String defaultOwner,
        String defaultLHSSymbol,
        int spanLimit,
        float oovFeatureCost_) throws IOException
    {
        this.symbolTable    = symbolTable;
        this.defaultOwner   = this.symbolTable.addTerminal(defaultOwner);
        this.defaultLHS     = this.symbolTable.addNonterminal(defaultLHSSymbol);
        this.spanLimit      = spanLimit;
        this.oovFeatureCost = oovFeatureCost_;
        this.root           = new MemoryBasedTrie();

        //==== loading grammar
        this.modelReader = createReader(formatKeyword, grammarFile, symbolTable);
        if (modelReader != null) {
            modelReader.initialize();
            for (BilingualRule rule : modelReader) {
                // the reader may hand back null entries; those are skipped
                if (rule != null) {
                    addRule(rule);
                }
            }
        } else {
            if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Couldn't create a GrammarReader for file " + grammarFile + " with format " + formatKeyword);
            }
        }

        this.printGrammar();
    }

    /**
     * Picks the reader implementation for a grammar format.
     *
     * @param formatKeyword {@code "hiero"} or {@code "samt"}
     * @param grammarFile   grammar file handed to the reader
     * @param symbolTable   symbol table handed to the reader
     * @return a reader for the format, or {@code null} (after logging a
     *         warning) when the format keyword is not recognised
     */
    protected GrammarReader<BilingualRule> createReader(String formatKeyword,
            String grammarFile, SymbolTable symbolTable) {

        if ("hiero".equals(formatKeyword)) {
            return new HieroFormatReader(grammarFile, symbolTable);
        } else if ("samt".equals(formatKeyword)) {
            return new SamtFormatReader(grammarFile, symbolTable);
        } else {
            // TODO: throw something?
            // TODO: add special warning if "heiro" mispelling is used
            if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Unknown GrammarReader format " + formatKeyword);
            }
            return null;
        }
    }

    //===============================================================
    // Methods
    //===============================================================

    /** @return the number of rules added to this grammar so far */
    public int getNumRules() {
        return this.qtyRulesRead;
    }

    /**
     * Builds a one-word-to-one-word rule for an out-of-vocabulary word,
     * using the grammar's default LHS and owner and the OOV rule ID.
     *
     * @param qtyFeatures number of feature scores to allocate (all zero by default)
     * @param sourceWord  symbol ID of the source word
     * @param targetWord  symbol ID of the target word
     * @param hasLM       whether a language model is used for decoding
     * @return the new OOV rule
     */
    public Rule constructOOVRule(int qtyFeatures, int sourceWord, int targetWord, boolean hasLM) {
        int[] french  = { sourceWord };
        int[] english = { targetWord };
        float[] feat_scores = new float[qtyFeatures];

        // TODO: This is a hack to make the decoding without a LM works
        /* when a ngram LM is used, the OOV word will have a cost 100.
         * if no LM is used for decoding, so we should set the cost of some
         * TM feature to be maximum
         */
        if ( (!hasLM) && qtyFeatures > 0) {
            feat_scores[0] = oovFeatureCost;
        }

        return new BilingualRule(this.defaultLHS, french, english, feat_scores, 0, this.defaultOwner, 0, getOOVRuleID());
    }

    /** @return the rule ID used for OOV rules, {@link #OOV_RULE_ID} */
    public int getOOVRuleID() {
        return OOV_RULE_ID;
    }

    /**
     * Builds a rule from explicitly supplied parts, owned by the
     * grammar's default owner. The rule carries the OOV rule ID.
     *
     * @param lhs         left-hand-side symbol ID
     * @param sourceWords source-side symbol IDs
     * @param targetWords target-side symbol IDs
     * @param scores      feature scores
     * @param arity       rule arity
     * @return the new rule
     */
    public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity) {
        return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.defaultOwner, 0, getOOVRuleID());
    }

    /**
     * if the span covered by the chart bin is greater than the
     * limit, then return false
     * <p>
     * A span limit of -1 marks a mono-glue grammar, which applies only
     * to spans starting at index 0.
     *
     * @param startIndex start of the span
     * @param endIndex   end of the span
     * @param pathLength not used by this implementation
     * @return whether this grammar may apply to the span
     */
    public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
        if (this.spanLimit == -1) { // mono-glue grammar
            return (startIndex == 0);
        } else {
            return (endIndex - startIndex <= this.spanLimit);
        }
    }

    /** @return the root of the rule trie ({@code null} if none was created) */
    public Trie getTrieRoot() {
        return this.root;
    }

    /**
     * Assigns the rule an ID and owner, then files it in the trie under
     * its source side, creating trie nodes and the rule bin as needed.
     *
     * @param rule the rule to add
     */
    protected void addRule(BilingualRule rule) {

        // Two different counters: a per-grammar rule count, and the
        // class-wide ID source shared by all grammars.
        this.qtyRulesRead++;
        ruleIDCount++;
        rule.setRuleID(ruleIDCount);
        rule.setOwner(defaultOwner);

        // TODO: make sure costs are calculated here or in reader
        temEstcost += rule.getEstCost();

        //=== identify the position, and insert the trie nodes as necessary
        MemoryBasedTrie pos = root;
        int[] french = rule.getFrench();
        for (int k = 0; k < french.length; k++) {
            int curSymID = french[k];

            /* Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth
             * like [X,1]), but the symbol in the Trie has to be cleaned, so that the match does
             * not care about the markup (i.e., [X,1] or [X,2] means the same thing, that is X) */
            if (this.symbolTable.isNonterminal(french[k])) {
                curSymID = modelReader.cleanNonTerminal(french[k]);
            }

            MemoryBasedTrie nextLayer = pos.matchOne(curSymID);
            if (null == nextLayer) {
                nextLayer = new MemoryBasedTrie();
                // the child table is allocated lazily, on the first extension
                if (!pos.hasExtensions()) {
                    pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
                }
                pos.childrenTbl.put(curSymID, nextLayer);
            }
            pos = nextLayer;
        }

        //=== add the rule into the trie node
        if (! pos.hasRules()) {
            pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
            this.qtyRuleBins++;
        }
        pos.ruleBin.addRule(rule);
    }

    /**
     * Logs a one-line summary of this grammar at INFO level.
     * <p>
     * NOTE: an earlier comment here claimed this always prints 0 for all
     * fields. The loading constructor calls it after the load loop, so
     * the counters reflect the rules added; only {@code num_pruned} is a
     * hard-coded 0.
     */
    protected void printGrammar() {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("###########Grammar###########");
            logger.info(String.format(
                "####num_rules: %d; num_bins: %d; num_pruned: %d; sumest_cost: %.5f",
                this.qtyRulesRead, this.qtyRuleBins, 0, temEstcost));
        }
    }
}