// Package joshua.decoder.ff.lm.buildin_lm
//
// Source Code of joshua.decoder.ff.lm.buildin_lm.LMGrammarJAVA$LMHash

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.lm.buildin_lm;

import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.lm.AbstractLM;
import joshua.decoder.ff.lm.LanguageModelFF;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Support;
import joshua.util.io.LineReader;
import joshua.util.Regex;

import java.io.IOException;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

// TODO: This class has a *huge* amount of redundant code. Eliminate it


/**
* This class implements:
* (1) reading the LM file into a Trie data structure
* (2) getting the LM probability for a given n-gram
* (3) getting the equivalent state
*
* @author Zhifei Li, <zhifei.work@gmail.com>
* @version $LastChangedDate:2008-07-28 18:44:45 -0400 (Mon, 28 Jul 2008) $
*/
public class LMGrammarJAVA extends AbstractLM {

  // BUG: Why are the IDs not static? Why are the strings not final?
  static String BACKOFF_WGHT_SYM = "<bow>";
  int BACKOFF_WGHT_SYM_ID; // used by LMModel
 
  static String LM_HAVE_PREFIX_SYM = "<havelzfprefix>"; // to indicate that lm trie node has children
  int LM_HAVE_PREFIX_SYM_ID;
 
  static String UNK_SYM = "<unk>"; // unknown lm word
  int UNK_SYM_ID;
 
 
  /** Used for logging the time cost for things */
  private long start_loading_time;
 
  /*a backoff node is a hashtable; it may include:
   * (1) probabilities for next words
   * (2) pointers to a next-layer backoff node (hashtable); the key used to look up the value is: sym_id + highestID
   * (3) the backoff weight for this node
   * (4) a suffix/prefix flag to indicate that there are ngrams starting from this suffix
   */
  private LMHash root = null;
  private int g_n_bow_nodes = 0;
  private int g_n_suffix_nodes = 0;
  static private float MIN_LOG_P = -9999.0f; //ngram prob must be smaller than this number
  static private double SUFFIX_ONLY = MIN_LOG_P*3; //ngram prob must be smaller than this number
 
  private double NON_EXIST_WEIGHT = 0; // the history has not appeared at all
  private int num_rule_read       = 0;
  boolean g_is_add_prefix_infor   = false;
  boolean g_is_add_suffix_infor   = false;
 
  HashMap<String, int[]> request_cache_prob        = new HashMap<String, int[]>();//cmd with result
  HashMap<String, int[]> request_cache_backoff     = new HashMap<String, int[]>();//cmd with result
  HashMap<String, int[]> request_cache_left_equiv  = new HashMap<String, int[]>();//cmd with result
  HashMap<String, int[]> request_cache_right_equiv = new HashMap<String, int[]>();//cmd with result
  int cache_size_limit= 250000;
 
 
  private static final Logger logger =
    Logger.getLogger(LMGrammarJAVA.class.getName());
 
  /**
   * Builds the pure-Java LM: registers the special LM symbols (backoff weight,
   * prefix flag, unknown word) in the shared symbol table, then loads the
   * ARPA-format grammar file into the in-memory trie rooted at {@code root}.
   *
   * @param psymbol             shared symbol table mapping words to integer ids
   * @param order               maximum n-gram order to load
   * @param lm_file             path to the ARPA-format LM file
   * @param is_add_suffix_infor if true, flag trie entries that are suffixes of longer n-grams
   * @param is_add_prefix_infor if true, flag trie nodes that are prefixes of longer n-grams
   * @throws IOException if the LM file cannot be read
   */
  public LMGrammarJAVA(SymbolTable psymbol, int order, String lm_file, boolean is_add_suffix_infor, boolean is_add_prefix_infor) throws IOException {
    super(psymbol, order);
    logger.info("use java lm");

    // register the special symbols and cache their ids for fast lookups
    this.BACKOFF_WGHT_SYM_ID   = psymbol.addTerminal(BACKOFF_WGHT_SYM);
    this.LM_HAVE_PREFIX_SYM_ID = psymbol.addTerminal(LM_HAVE_PREFIX_SYM);
    this.UNK_SYM_ID            = psymbol.addTerminal(UNK_SYM);

    // record the flags before loading, since add_rule reads them
    g_is_add_prefix_infor = is_add_prefix_infor;
    g_is_add_suffix_infor = is_add_suffix_infor;

    read_lm_grammar_from_file(lm_file);//TODO: what about sentence-specific?

    //Symbol.add_global_symbols(true);
    /*//debug
    LMHash[] t_arrays = new LMHash[10000000];
    System.out.println("##### mem used (kb): " + Support.getMemoryUse());
    System.out.println("##### time used (seconds): " + (System.currentTimeMillis()-start_loading_time)/1000);
    for(int i=0; i<10000000;i++){
      LMHash t_h = new LMHash(5);
      double j=0.1f;
      t_h.put(i, j);

      //System.out.println("ele is " + t_h.get(i));
      t_arrays[i]=t_h;
      if(i%1000000==0){
        System.out.println(i +" ##### mem used (kb): " + Support.getMemoryUse());
        System.out.println("##### time used (seconds): " + (System.currentTimeMillis()-start_loading_time)/1000);
      }
    }
    System.exit(0);
    //end*/


    /*//debug
    double[] bow = new double[1];
    int[] backoff_history = new int[1];
    backoff_history[0]=Symbol.UNTRANS_SYM_ID;
    boolean finalized_backoff = check_backoff_weight(backoff_history, bow, 0);//backoff weight is already added outside this function?

    //System.out.println("bow_weigth id: " + Symbol.BACKOFF_WGHT_SYM_ID);
    System.out.println("is final: " + finalized_backoff);
    System.out.println("bow: " + bow[0]);
    System.exit(0);*/
  }
 
 
  //  signature of this item: i, j, lhs, states (in fact, we do not need i, j)
  private String get_signature(int[] words) {
    StringBuffer s = new StringBuffer(words.length);
    for (int i = 0; i < words.length; i++) {
      s.append(' ').append(words[i]);
    }
    return s.toString();
  }
 
 
 
  /*note: the mismatch between srilm and our java implemtation is in: when unk words used as context, in java it will be replaced with "<unk>", but srilm will not, therefore the
  *lm cost by srilm may be smaller than by java, this happens only when the LM file have "<unk>" in backoff state*/
  /**
   * Computes the log-probability of {@code ngram} by walking the context trie
   * from the second-to-last word backwards, keeping the probability of the
   * longest matching context and accumulating backoff weights for the
   * contexts beyond it.
   *
   * @param ngram word ids, last element is the predicted word
   * @param order n-gram order (unused here; the ngram array length governs)
   * @return log probability (plus accumulated backoff weights)
   */
  protected double ngramLogProbability_helper(int[] ngram, int order) {
    Double res;
    //cache
    //String sig = get_signature(ngram);
    //res = (Double)request_cache_prob.get(sig);
    //if(res!=null)return res;

    int[] ngram_wrds = replace_with_unk(ngram); // TODO
    if (ngram_wrds[ngram_wrds.length-1] == UNK_SYM_ID) { // TODO: wrong implementation in hiero
      // predicting an unknown word: charge the configured ceiling cost
      res = -JoshuaConfiguration.lm_ceiling_cost;
    } else {
      //TODO: untranslated words
      if (null == root) {
        throw new RuntimeException("root is null");
      }
      int last_word_id = ngram_wrds[ngram_wrds.length-1];
      LMHash pos = root;
      // unigram probability as the fallback when no context matches
      // NOTE(review): get_valid_prob can return null (suffix-only entry);
      // if no context node resets prob, the final "prob + bow_sum" would NPE — confirm
      Double prob = get_valid_prob(pos,last_word_id);
      double bow_sum = 0;
      // reverse search, start from the second-last word
      for (int i = ngram_wrds.length - 2; i >= 0; i--) {
        // context keys are stored offset by the symbol table's highest id
        LMHash next_layer =
          (LMHash) pos.get(ngram_wrds[i] + this.symbolTable.getHighestID());

        if (null != next_layer) { // have context/bow node
          pos = next_layer;
          Double prob2 = get_valid_prob(pos,last_word_id);
          if (null != prob2) { // reset, if backoff, will at least back off to here
            prob    = prob2;
            bow_sum = 0;
          } else {
            // no probability at this context: accumulate its backoff weight
            Double bow = (Double) pos.get(BACKOFF_WGHT_SYM_ID);
            if (null != bow) {
              bow_sum += bow;
            }
          }
        } else { // do not have context/bow node
          break;
        }
      }
      res = prob + bow_sum;
    }
    //cache
    //if(request_cache_prob.size()>cache_size_limit)
    //  request_cache_prob.clear();
    //request_cache_prob.put(sig, res);

    return res;
  }
 
  private Double get_valid_prob(LMHash pos, int wrd) {
    Double res = (Double)pos.get(wrd);
    if (! g_is_add_suffix_infor) {
      return res;
    }
   
    if (null != res) {
      if (res == SUFFIX_ONLY) {
        return null;
      } else if (res > MIN_LOG_P) { // logP without suffix flag
        return res;
      } else { // logP with suffix flag
        return res - MIN_LOG_P;
      }
    }
    return null;
  }
 
//  ##################### begin right equivalent state #############
  //idea: from right to left, if a span does not have a backoff weight, which means all ngram having this span will backoff, and we can safely remove this state
  //the absence of backoff weight for low-order ngram implies the absence of higher-order ngram
  //the absence of backoff weight for low-order ngram implies the absence of backoff weight for high order ngram ????????????????
  /*e.g., if we do not have bow node for A, then we can say there is no bow nodes for
   * (1)*A: implied by the trie structure
   * (2)A*: if we have a BOW node for A* (with bow weight), due to the representantion of ARPA format, then we must have a probability for A*, which implies we have a BOW node for A
   * (3)*A*
   */
 
  //the returned array lenght must be the same the len of original_state
  //the only change to the original_state is: replace with more non-null state words to null state
  //O(n^2)
  /**
   * Maps a right-side LM state to an equivalent, possibly smaller state:
   * leading words whose span has no prefix continuation in the LM are
   * replaced with the null-state symbol, since any n-gram extending them
   * must back off. Results are cached by signature. The returned array has
   * the same length as the input.
   *
   * @param original_state_in right LM state word ids
   * @param order n-gram order (unused directly; length check uses ngramOrder)
   * @return equivalent right state (same length as input)
   */
  public int[] rightEquivalentState(int[] original_state_in, int order) {
    if ( !JoshuaConfiguration.use_right_equivalent_state
    || original_state_in.length != ngramOrder - 1) {
      return original_state_in;
    }
    int[] res;
    //cache
    String sig = get_signature(original_state_in);
    res = (int[])request_cache_right_equiv.get(sig);
    if (null != res) {
      //System.out.println("right cache hit");
      return res;
    }

    // we do not put this statement at the beging to match the SRILM condition (who does not have replace_with_unk)
    int[] original_state = replace_with_unk(original_state_in);

    res = new int[original_state.length];
    for (int i = 1; i <= original_state.length; i++) { // forward search
      // suffix of the state starting at position i-1
      int[] cur_wrds = Support.sub_int_array(original_state, i-1, original_state.length);
      if (! have_prefix(cur_wrds)) {
        // no n-gram starts with this span: the word carries no information
        res[i-1] = LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID;
      } else {
        // first span with a prefix continuation: keep the rest verbatim
        for (int j = i; j <= original_state.length; j++) {
          res[j-1] = original_state[j-1];
        }
        break;
      }
    }
    //cache
    if (request_cache_right_equiv.size() > cache_size_limit) {
      request_cache_right_equiv.clear();
    }
    request_cache_right_equiv.put(sig, res);

    //System.out.println("right org state: " + Symbol.get_string(original_state) +"; equiv state: " + Symbol.get_string(res));
    return res;
  }
 
 
  //O(n)
  private boolean have_prefix(int[] words) {
    LMHash pos = root;
    int i = words.length - 1;
    for ( ; i >= 0; i--) { // reverse search
      LMHash next_layer =
        (LMHash) pos.get(words[i] + this.symbolTable.getHighestID());
      if (null != next_layer) {
        pos = next_layer;
      } else {
        break;
      }
    }
    return (i == -1 && pos.containsKey(LM_HAVE_PREFIX_SYM_ID));
  }
  
//    ##################### end right equivalent state #############
  

   //############################ begin left equivalent state ##############################

  
  /*several observation:
   * In general:
   * (1) In general, there might be more than one <bo> or <null>, and they can be in any position
   * (2) in general, whenever there is a <bo> or <null> in a given ngram, then it will definitely backoff since it has same/more context
  */   
  //return: (1) the equivlant state vector; (2) the finalized cost; (3) the estimated cost
  //  O(n^2)
  /**
   * Maps a left-side LM state to an equivalent state, splitting the LM cost
   * into a finalized part and an estimated part returned through
   * {@code cost}: cost[0] = finalized cost, cost[1] = estimated cost.
   * Prefixes of the state that have no suffix continuation in the LM can be
   * finalized (they will always back off); the remainder is kept as state.
   *
   * @param original_state_wrds_in left LM state word ids
   * @param order n-gram order (unused directly)
   * @param cost out-parameter of length >= 2; see above
   * @return equivalent left state (same length as input)
   */
  public int[] leftEquivalentState(int[] original_state_wrds_in, int order, double[] cost) {
    if (! JoshuaConfiguration.use_left_equivalent_state) {
      return original_state_wrds_in;
    }

    // we do not put this statement at the beging to match the SRILM condition (who does not have replace_with_unk)
    int[] original_state_wrds =
      replace_with_unk(original_state_wrds_in);

    //## deal with case overlap state
    if (original_state_wrds.length < ngramOrder - 1) {
      // state shorter than a full context: everything is estimated cost
      for (int i = 0; i < original_state_wrds.length; i++) {
        int[] currentWords = Support.sub_int_array(original_state_wrds, 0, i+1);

        // add estimated cost
        cost[1] += -ngramLogProbability(currentWords, currentWords.length);
      }
      return original_state_wrds;
    }

    //## non-overlaping state
    int[]  res_equi_state = new int[original_state_wrds.length];
    double res_final_cost = 0.0; // finalized cost
    double res_est_cost   = 0.0; // estimated cost

    BACKWORD_SEARCH:
    for (int i = original_state_wrds.length; i > 0; i--) {
      // prefix of the state of length i
      int[] cur_wrds =
        Support.sub_int_array(original_state_wrds, 0, i);
      if (! have_suffix(cur_wrds)) {
        int last_wrd = cur_wrds[i-1];
        if (last_wrd == UNK_SYM_ID) {
          // unknown word: keep it in the state, cost is only estimated
          res_equi_state[i-1] = last_wrd;

          // add estimated cost
          res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
        } else {
          // no suffix continuation: this prefix's cost can be finalized
          if (last_wrd != LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) {
            res_final_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
          }

          res_equi_state[i-1] =
            LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID;

          /*//TODO: for simplicity, we may just need BACKOFF_LEFT_LM_STATE_SYM_ID??
          int[] backoff_history = Support.sub_int_array(cur_wrds, 0, cur_wrds.length-1);//ignore last wrd
          double[] bow = new double[1];
          boolean finalized_backoff = check_backoff_weight(backoff_history, bow, 0);//backoff weight is already added outside this function?
          if(finalized_backoff==true){
            res_equi_state[i-1]=Symbol.NULL_LEFT_LM_STATE_SYM_ID;//no state, no bow, no est_cost
          }else{
            res_equi_state[i-1]=Symbol.BACKOFF_LEFT_LM_STATE_SYM_ID;
          }*/
        }

      } else { // we do have a suffix
        // keep the remaining prefix verbatim; its cost stays estimated
        for (int j = i; j > 0; j--) {
          res_equi_state[j-1] = original_state_wrds[j-1];
          cur_wrds = Support.sub_int_array(original_state_wrds, 0, j);

          // Estimated cost
          res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
        }
        break BACKWORD_SEARCH;
      }
    }

    cost[0] = res_final_cost;
    cost[1] = res_est_cost;
    return res_equi_state;
  }
 
 
  private boolean have_suffix(int[] words) {
    LMHash pos = root;
    //reverse search, start from the second-last word
    for (int i = words.length-2; i >= 0; i--) {
      LMHash next_layer =
        (LMHash) pos.get(words[i] + this.symbolTable.getHighestID());
      if (null != next_layer) {
        pos = next_layer;
      } else {
        return false;
      }
    }
    Double prob = (Double)pos.get(words[words.length-1]);
    return (null != prob && prob <= MIN_LOG_P);
  }
 
 
  protected double logProbabilityOfBackoffState_helper(int[] ngram_wrds, int order, int n_additional_bow) {
    int[] backoff_wrds =
      Support.sub_int_array(ngram_wrds, 0, ngram_wrds.length - 1);
    double[] sum_bow = new double[1];
    check_backoff_weight(backoff_wrds, sum_bow, n_additional_bow);
    return sum_bow[0];
  }
 
 
  //if exist backoff weight for backoff_words, then return the accumated backoff weight
  //  if there is no backoff weight for backoff_words, then, we can return the finalized backoff weight
  /**
   * Walks the context trie along {@code backoff_words} (reverse order),
   * summing the backoff weights of the nodes whose position index is at or
   * below {@code num_backoff - 1} into {@code sum_bow[0]}.
   *
   * @param backoff_words history word ids
   * @param sum_bow out-parameter of length >= 1: accumulated backoff weight
   * @param num_backoff number of backoffs already applied by the caller
   * @return true when the backoff cannot be finalized (the full path exists
   *         and the deepest node still has a backoff weight — or the walk
   *         stopped early), per the logic on the last line
   */
  private boolean check_backoff_weight(int[] backoff_words, double[] sum_bow, int num_backoff) {
    if (backoff_words.length <= 0) return false;

    double sum = 0;
    LMHash pos = root;

    //the start index that backoff should be applied
    int start_use_i = num_backoff - 1;

    Double bow = null;
    int i = backoff_words.length - 1;
    for(; i >= 0; i--) { // reverse search through the context trie
      LMHash next_layer = (LMHash) pos.get(
        backoff_words[i] + this.symbolTable.getHighestID());

      if (null != next_layer) {
        bow = (Double)next_layer.get(BACKOFF_WGHT_SYM_ID);
        // only positions within the already-backed-off range contribute
        if (null != bow && i <= start_use_i) {
          sum += bow;
        }
        pos = next_layer;
      } else {
        break; // path ends: no deeper context exists
      }
    }
    sum_bow[0] = sum;

    //the higest order have backoff weight, so we cannot finalize
    return (i != -1 || null == bow);
  }
//  ######################################## end left equiv state ###########################################


//  ######################################## general helper function ###########################################
  protected final int[] replace_with_unk(int[] in) {
    int[] res = new int[in.length];
    for (int i = 0; i < in.length; i++) {
      res[i] = replace_with_unk(in[i]);
    }
    return res;
  }
 
 
  protected int replace_with_unk(int in) {
    if (root.containsKey(in)
    || in == LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID
    || in == LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID
    ) {
      return in;
    } else {
      return UNK_SYM_ID;
    }
  }
 
 
//  ######################################## read LM grammar by the Java implementation ###########################################
 
  /*a backoff node is a hashtable, it may include:
   * (1) probability for next words: key id is positive
   * (2) pointer to a next-layer backoff node (hashtable): key id is negative!!!
   * (3) backoff weight for this node
   * (4) suffix flag to indicate that there is ngrams start from this suffix
   */
 
  //read grammar locally by the Java implementation
  /**
   * Reads an ARPA-format LM file into the trie rooted at {@code root}.
   * Lines before the first "\N-grams:" header are skipped; each subsequent
   * non-blank line is handed to add_rule. Loading stops once the section
   * order exceeds the configured ngramOrder.
   *
   * NOTE(review): the order is parsed from a single character
   * (substring(1, 2)), so sections of order >= 10 would be misread — confirm
   * this is acceptable for the supported LM orders.
   *
   * @param grammar_file path of the ARPA LM file
   * @throws IOException if the file cannot be read
   */
  private void read_lm_grammar_from_file(String grammar_file)
  throws IOException {
    start_loading_time = System.currentTimeMillis();
    root = new LMHash();
    // root acts as the empty-context backoff node with weight 0
    root.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);

    if (logger.isLoggable(Level.INFO))
      logger.info("Reading grammar from file " + grammar_file);

    boolean start = false;
    int order = 0;

    Regex blankLine  = new Regex("^\\s*$");
    Regex ngramsLine = new Regex("^\\\\\\d-grams:\\s*$");

    LineReader grammarReader = new LineReader(grammar_file);
    try { for (String line : grammarReader) {
      line = line.trim();
      if (blankLine.matches(line)) {
        continue;
      }
      if (ngramsLine.matches(line)) { // \1-grams:
        start = true;
        order = Integer.parseInt(line.substring(1, 2));
        if (order > ngramOrder) {
          break; // ignore n-grams longer than the configured order
        }
        if (logger.isLoggable(Level.INFO))
          logger.info("begin to read ngrams with order " + order);

        continue; //skip this line
      }
      if (start) {
        add_rule(line,order, g_is_add_suffix_infor, g_is_add_prefix_infor);
      }
    } } finally { grammarReader.close(); }

    if (logger.isLoggable(Level.FINE)) {
      logger.fine("# of bow nodes: " + g_n_bow_nodes + " ; # of suffix nodes: " + g_n_suffix_nodes);
      logger.fine("add LMHash  " + g_n_bow_nodes);
      logger.fine("##### mem used (kb): " + Support.getMemoryUse());
      logger.fine("##### time used (seconds): "
        + (System.currentTimeMillis() - start_loading_time) / 1000);
    }
  }
 
 
  // format: prob \t ngram \t backoff-weight
  // format: prob \t ngram \t backoff-weight
  /**
   * Inserts one ARPA line into the trie. The line has the form
   * "prob w1 ... wN [bow]". Three passes are made over the words:
   * (1) descend/insert the context path for w1..wN-1 (adding suffix flags
   * when enabled), then store the probability for wN at that node;
   * (2) when prefix info is enabled, mark every proper prefix node;
   * (3) when a backoff weight is present, descend/insert the full path
   * w1..wN and store the weight at its deepest node.
   *
   * @param line one ARPA n-gram line
   * @param order the order of the current ARPA section
   * @param is_add_suffix_infor whether to record suffix flags
   * @param is_add_prefix_infor whether to record prefix flags
   */
  private void add_rule(String line, int order, boolean is_add_suffix_infor, boolean is_add_prefix_infor) {
    num_rule_read++;
    if (num_rule_read % 1000000 == 0) {
      if (logger.isLoggable(Level.FINE))
        logger.fine("read rules " + num_rule_read);

      //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
      if (logger.isLoggable(Level.FINE))
        logger.fine("##### time used (seconds): "
          + (System.currentTimeMillis() - start_loading_time) / 1000);
    }
    String[] wrds = Regex.spaces.split(line.trim());

    // expect prob + order words (+ optional bow); silently skip malformed lines
    if (wrds.length < order + 1 || wrds.length > order + 2) { // TODO: error
      //logger.severe("wrong line: "+ line);
      return;
    }
    int last_word_id = this.symbolTable.addTerminal(wrds[order]);

    //##### identify the BOW position, insert the backoff node if necessary, and add suffix information
    LMHash pos = root;
    // reverse search, start from the second-last word
    for (int i = order - 1; i > 0; i--) {
      if (is_add_suffix_infor) {
        // mark that an n-gram ends with last_word_id under this context
        Double t_prob = (Double) pos.get(last_word_id);
        if (null != t_prob) {
          if (t_prob > MIN_LOG_P) { // have prob, but not suffix flag
            double tem = t_prob + MIN_LOG_P;
            pos.put(last_word_id, tem); // overwrite
          }
        } else {
          pos.put(last_word_id, SUFFIX_ONLY);
        }
      }
      int cur_sym_id = this.symbolTable.addTerminal(wrds[i]);
      //System.out.println(this.symbolTable.getHighestID());
      // context keys are offset by the highest symbol id to avoid clashes
      LMHash next_layer =
        (LMHash) pos.get(cur_sym_id + this.symbolTable.getHighestID());
      if (null != next_layer) {
        pos = next_layer;
      } else {
        LMHash new_tnode = new LMHash(); // create new bow node
        pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
        pos = new_tnode;

        g_n_bow_nodes++;
        if (g_n_bow_nodes % 1000000 == 0) {
          if (logger.isLoggable(Level.FINE))
            logger.fine("add LMHash  " + g_n_bow_nodes);

          //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
          if (logger.isLoggable(Level.FINE))
            logger.fine("##### time used (seconds): "
              + (System.currentTimeMillis() - start_loading_time) / 1000);
        }
      }
      if (! pos.containsKey(BACKOFF_WGHT_SYM_ID)) {
        //indicate it is a backoof node, to distinguish from a pure prefix node
        pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
      }
    }

    //##### add probability
    if (is_add_suffix_infor && pos.containsKey(last_word_id)) {
      // existing entry is a suffix flag: keep the flag encoded in the value
      double tem = Double.parseDouble(wrds[0]) + MIN_LOG_P;
      pos.put(last_word_id, tem); // add probability and suffix flag
    } else {
      // add probability
      pos.put(last_word_id, Double.parseDouble(wrds[0]));
    }

    //##### add prefix infor, a prefix node is just like a BOW node
    if (is_add_prefix_infor) {
      pos.put(LM_HAVE_PREFIX_SYM_ID, 1); // for preifx [1,order-1]
      for (int i = 1; i < order-1; i++) { // ignore the last prefix
        pos = root; // reset pos
        for (int j = i; j >= 1; j--) { // reverse search: [1,i]
          int cur_sym_id = this.symbolTable.addTerminal(wrds[j]);
          LMHash next_layer= (LMHash) pos.get(
            cur_sym_id + this.symbolTable.getHighestID());

          if (null != next_layer) {
            pos = next_layer;
          } else {
            LMHash new_tnode = new LMHash();//create new prefix node
            pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
            pos = new_tnode;

            g_n_bow_nodes++;
            if (g_n_bow_nodes % 1000000 == 0) {
              if (logger.isLoggable(Level.FINE))
                logger.fine("add LMHash  " + g_n_bow_nodes);

              //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
              if (logger.isLoggable(Level.FINE))
                logger.fine("##### time used (seconds): "
                  + (System.currentTimeMillis() - start_loading_time) / 1000);
            }
          }
        }
        pos.put( LM_HAVE_PREFIX_SYM_ID, 1);//only the last node should have this flag
      }
    }


    //##### add bow
    if (wrds.length == order+2) { // have bow weight to add
      pos = root;
      // reverse search, start from the last word
      for (int i = order; i >= 1; i--) {
        int cur_sym_id = this.symbolTable.addTerminal(wrds[i]);
        LMHash next_layer = (LMHash) pos.get(
          cur_sym_id + this.symbolTable.getHighestID());
        if (null != next_layer) {
          pos = next_layer;
        } else {
          LMHash new_tnode = new LMHash(); // create new bow node
          pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
          pos = new_tnode;

          g_n_bow_nodes++;
          if (g_n_bow_nodes % 1000000 == 0) {
            if (logger.isLoggable(Level.FINE))
              logger.fine("add LMHash  " + g_n_bow_nodes);
            //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
            if (logger.isLoggable(Level.FINE))
              logger.fine("##### time used (seconds): "
                + (System.currentTimeMillis() - start_loading_time) / 1000);
          }
        }

        //add bow weight here
        if (i == 1) { // force to override the backoff weight
          double backoff_weight = Double.parseDouble(wrds[order+1]);
          pos.put(BACKOFF_WGHT_SYM_ID, backoff_weight);
        } else {
          if (! pos.containsKey(BACKOFF_WGHT_SYM_ID)) {
            //indicate it is a backoof node, to distinguish from a pure prefix node
            pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
          }
        }
      }
    }
  }
 
 
  /* ###################### not used
   private boolean have_suffix_old(int[] words){
     LMHash pos=root;
     int i=words.length-1;
     for(; i>=0; i--){//reverse search     
        LMHash next_layer=(LMHash) pos.get(words[i]+p_symbol.getLMEndID());
        if(next_layer!=null){
          pos=next_layer;         
        }else{
          break;
        }
     }
     if(i==-1 && pos.containsKey(Symbol.LM_HAVE_SUFFIX_SYM_ID))
       return true;
     else
       return false;
   }
   */
 
 
  //in theory: 64bytes (init size is 5)
  //in practice: 86 bytes (init size is 5)
  //in practice: 132 bytes (init size is 10)
  //in practice: 211 bytes (init size is 20)
  //important note: if we use tbl.put(key, new Integer(1)) instead of tbl.put(key, (new Integer(1)).intValue()), then for each element, we waste 16 bytes for the Integer object,
  //and the GC will not collect this Double object, because the hashtable ref it
  private static class LMHash //4bytes
  {
    //######note: key must be positive integer, and value must not be null
    /*if key can be both positive and negative, then lot of collision, or will take very long to call get()
     * imagine, we put all numbers in [1,20000] in hashtable, but try to call get() by numbers [-20000,-1], it will take very long time
     */
   
    //TODO: should round the array size to a prime number?
    static double load_factor = 0.6;
    static int default_init_size = 5;
   
    int size = 0; // 8 bytes?
    int[] key_array; // pointer itself is 4 bytes?, when it is newed, then add 10 more bytes, and the int itself
    Object[] val_array; // pointer itself is 4 bytes?, when it is newed, then add 10 more bytes, and the object itself
   
    public LMHash(int init_size) {
      key_array = new int[init_size];
      val_array = new Object[init_size];
    }
   
    public LMHash() {
      key_array = new int[default_init_size];
      val_array = new Object[default_init_size];
    }
   
    //return the positive position for the key
    private int hash_pos(int key, int length) {
      //return Math.abs(key % length);
      return key % length;
    }
   
    public Object get(int key) {
      Object res = null;
      int pos = hash_pos(key, key_array.length);
      while (key_array[pos] != 0) { // search until empty cell,
        if (key_array[pos] == key) {
          return val_array[pos]; // found
        }
        pos++; //linear search
        pos = hash_pos(pos, key_array.length);
      }
      return res;
    }
   
    public boolean containsKey(int key) {
      return (null != get(key));
    }
   
    public int size() {
      return size;
    }
   
    public void put(int key, Object value) {
      if (null == value) {
        throw new IllegalArgumentException("LMHash, value is null");
      }
     
      int pos = hash_pos(key, key_array.length);
      while (key_array[pos] != 0) { // search until empty cell,
        if (key_array[pos] == key) {
          val_array[pos] = value; // found, and overwrite
          return;
        }
        pos++; //linear search
        pos = hash_pos(pos, key_array.length);
      }
     
      //we get to here, means we do not have this key, need to insert it
      //data_array[pos] = new LMItem(key, value);
      key_array[pos] = key;
      val_array[pos] = value;
     
      size++;
      if (size >= key_array.length * load_factor) {
        expand_tbl();
      }
    }
   
   
    private void expand_tbl() {
      int new_size = key_array.length * 2 + 1; // TODO
      int[] new_key_array = new int[new_size];
      Object[] new_val_array = new Object[new_size];
     
      for (int i = 0; i < key_array.length; i++) {
        if (key_array[i] != 0) { // add the element
          int pos = hash_pos(key_array[i], new_key_array.length);
         
          // find first empty postition, note that it is not possible that we need to overwrite
          while (new_key_array[pos] != 0) {
            pos++; //linear search
            pos = hash_pos(pos, new_key_array.length);
          }
          new_key_array[pos] = key_array[i]
          new_val_array[pos] = val_array[i];
        }
      }
      key_array = new_key_array;
      val_array = new_val_array;
    }
  }
}
// TOP
//
// Related Classes of joshua.decoder.ff.lm.buildin_lm.LMGrammarJAVA$LMHash
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.