Package joshua.corpus.vocab

Source Code of joshua.corpus.vocab.DefaultSymbol

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.vocab;

import joshua.util.io.LineReader;
import joshua.util.Regex;

import java.io.IOException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Logger;

/**
* this class implement
* (1) initialize the symbol table
* (2) provide conversion between symbol and integers
*
* How to initialize the Symbol Having multiple LM modes complicate
* the class, we have four LM modes: JAVA_LM, SRILM, Distributed_LM,
* and NONE_LM. The NONE_LM and JAVA_LM will be treated as same.
*
* JAVA_LM and NONE_LM: call add_global_symbols(true) to initialize
* SRILM: the SRILM must first be initialized, then call add_global_symbols(false)
* DistributedLM (from decoder): call init_sym_tbl_from_file(true)
* DistributedLM (from LMServer): call init_sym_tbl_from_file(true/false)
*
* @author Zhifei Li, <zhifei.work@gmail.com>
* @version $LastChangedDate: 2010-01-20 09:33:41 -0600 (Wed, 20 Jan 2010) $
*/
public abstract class DefaultSymbol extends AbstractSymbolTable {
 
  public boolean isReadingFromFile = false;
 
  protected int lmStartSymID = 10000; // 1..10000 reserved for special purpose
 
  //@todo: change this to a very large value such as 500000001 or a very small value such as 50001 seems slow down the JavaLM significatnlty
  protected int lmEndSymID = 5000001; // max vocab 1000k
 
  // terminal symbol may get from a tbl file, srilm, or a lm file
  //**non-terminal symbol is always from myself, and the integer should always be negative 
  private HashMap<String,Integer> string2id = new HashMap<String,Integer>();
  private HashMap<Integer,String> id2string = new HashMap<Integer,String>();
  private int nonterminalCurrentId = -1;
 
  private static final Logger logger =
    Logger.getLogger(DefaultSymbol.class.getName());
 
 
  public DefaultSymbol() {
    // do nothing here, because we want the sub-class doing specific things
  }
 
 
  //protected abstract String getTerminalWord(int id);
 
 
  final public String getWord(int id) {
    if (isNonterminal(id)) {
      return getNonterminal(id);
    } else {
      return getTerminal(id);
    }
  }
 
 
  final public int getLowestID() {
    return this.lmStartSymID;
  }
 
 
  final public int getHighestID() {
    return this.lmEndSymID;
  }
 
 
  final public String getNonterminal(int id) {
    String res = this.id2string.get(id);
    if (null == res) {
      throw new RuntimeException("try to query the string for non exist id " + id + ", must exit");
    }
    return res;
  }
 
 
  final public String getWords(Integer[] ids) {
    StringBuffer sb = new StringBuffer();
   
    if (ids.length > 0) sb.append(this.getWord(ids[0]));
    for (int i = 1; i < ids.length; i++) {
      sb.append(' ').append(this.getWord(ids[i]));
    }
    return sb.toString();
  }
 
  final public String getWords(int[] ids) {
    StringBuffer sb = new StringBuffer();
   
    if (ids.length > 0) sb.append(this.getWord(ids[0]));
    for (int i = 1; i < ids.length; i++) {
      sb.append(' ').append(this.getWord(ids[i]));
    }
    return sb.toString();
  }
 
  final public String getWords(List<Integer> ids) {
    StringBuffer      sb = new StringBuffer();
    Iterator<Integer> it = ids.iterator();
   
    if (it.hasNext()) sb.append(this.getWord(it.next()));
    while (it.hasNext()) sb.append(' ').append(this.getWord(it.next()));
    return sb.toString();
  }
 
 
   
//  ####### following functions used for TM only
  final public int addNonterminal(String str) {
    Integer id = this.string2id.get(str);
    if (null != id) { // already have this symbol
      if (! isNonterminal(id)) {
        throw new RuntimeException("NONTSym: " + str + "; id: " + id);
      }
      return id;
    } else {
      string2id.put(str, nonterminalCurrentId);
      id2string.put(nonterminalCurrentId, str);
      nonterminalCurrentId--;
      return (nonterminalCurrentId + 1);
    }
  }
 
 
  final public boolean isNonterminal(int id) {
    return (id < 0);
  }
 
 
  protected void initializeSymTblFromFile(String fname)
  throws IOException {
    this.isReadingFromFile = true;
    //### read file into tbls
    HashMap<String,Integer> localStr2id = new HashMap<String,Integer>();
    HashMap<Integer,String> localId2str = new HashMap<Integer,String>();
   
    LineReader symboltableReader = new LineReader(fname);
    try { for (String line : symboltableReader) {
      String[] fds = Regex.spaces.split(line);
      if (2 != fds.length) {
        logger.warning("read index, bad line: " + line);
        continue;
      }
      String str = fds[0].trim();
      int id = Integer.parseInt(fds[1]);
     
      String uniqueStr;
      if (null != localStr2id.get(str)) { // it is quite possible that java will treat two stings as the same when other language (e.g., C or perl) treat them differently, due to unprintable symbols
        logger.warning("duplicate string (add fake): " + line);
        uniqueStr = str + id;//fake string
        //System.exit(1);//TODO
      } else {
        uniqueStr = str;
      }
      localStr2id.put(uniqueStr, id);
     
      //it is guaranteed that the strings in localId2str are different
      if (null != localId2str.get(id)) {
        throw new RuntimeException("duplicate id, have to exit; " + line);
      } else {
        localId2str.put(id, uniqueStr);
      }
    } } finally { symboltableReader.close(); }
   
    /*if (localId2str.size() >= this.lm_end_sym_id - this.lm_start_sym_id) {
      throw new RuntimeException("read symbol tbl, tlb is too big");
    }*/
   
    //#### now add the tbl into srilm/java-tbl
    int n_added = 0;
    for (int i = this.lmStartSymID; i < this.lmEndSymID; i++) {
      // it is guranteed that the strings in localId2str are different
      String str = localId2str.get(i);
      int id;
      if (null != str) {
        id = this.addTerminal(str);
        n_added++;
      } else { // non-continous index
        logger.warning("added fake symbol, be alert");
        id = this.addTerminal("lzf" + i);
      }
      if (id != i) {
        throw new RuntimeException("id supposed: " + i + " != assigned " + id + " symbol:" + str);
      }
      if (n_added >= localId2str.size()) {
        break;
      }
    }
  }
 
 
  public int[] getIDs(String sentence) {
    return this.addTerminals(sentence);
  }

  public String getTerminals(int[] wordIDs) {
    return this.getWords(wordIDs);
  }

  public List<String> getWords() {
    return new ArrayList<String>(id2string.values());
  }

  public int size() {
    return this.id2string.size();
  }
}
TOP

Related Classes of joshua.corpus.vocab.DefaultSymbol

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.