/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.lm;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Regex;
import joshua.util.io.LineReader;
/**
 * Utility class for reading ARPA language model files:
 * it can build a symbol table from a model file, report the model's
 * n-gram order, and iterate over the n-grams the file contains.
 * (A minimal usage sketch appears in the {@code main} method at the
 * end of this class.)
 *
 * @author Lane Schwartz
 */
public class ArpaFile implements Iterable<ArpaNgram> {
/** Logger for this class. */
private static final Logger logger =
Logger.getLogger(ArpaFile.class.getName());
/** Regular expression representing a blank line. */
public static final Regex BLANK_LINE = new Regex("^\\s*$");
/**
* Regular expression representing a line
* starting a new section of n-grams in an ARPA language model file.
*/
public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$");
/**
 * Regular expression representing the <code>\end\</code> marker line
 * that terminates an ARPA language model file.
 */
public static final Regex NGRAM_END = new Regex("^\\\\end\\\\\\s*$");
/** ARPA file for this object. */
private final File arpaFile;
/** The symbol table associated with this object. */
private final SymbolTable vocab;
/**
* Constructs an object that represents an ARPA language model file.
*
* @param arpaFileName File name of an ARPA language model file
* @param vocab Symbol table to be used by this object
*/
public ArpaFile(String arpaFileName, SymbolTable vocab) {
this.arpaFile = new File(arpaFileName);
this.vocab = vocab;
}
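/**
 * Constructs an object that represents an ARPA language model file,
 * creating a new symbol table and populating it with every word
 * encountered in the file.
 *
 * @param arpaFileName File name of an ARPA language model file
 * @throws IOException if the file cannot be read
 */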
public ArpaFile(String arpaFileName) throws IOException {
this.arpaFile = new File(arpaFileName);
this.vocab = new Vocabulary();
// Read the file once, adding every word that appears in it to the vocabulary.
LineReader grammarReader = new LineReader(arpaFileName);
try {
for (String line : grammarReader) {
String[] parts = Regex.spaces.split(line);
// Every word in a well-formed ARPA file appears on a unigram line, so
// collecting the second whitespace-delimited field of each data line is
// enough to cover the vocabulary. (Count declarations such as "1=4977"
// are also picked up; they simply become unused vocabulary entries.)
if (parts.length > 1) {
String word = parts[1];
if (logger.isLoggable(Level.FINE)) logger.fine("Adding to vocab: " + word);
vocab.addTerminal(word);
} else {
logger.info(line);
}
}
} finally {
grammarReader.close();
}
logger.info("Done constructing ArpaFile");
}
/**
* Gets the symbol table associated with this object.
*
* @return the symbol table associated with this object
*/
public SymbolTable getVocab() {
return vocab;
}
/**
 * Gets the total number of n-grams in this ARPA language model file.
 * Note that this requires a complete pass over the file.
 *
 * @return total number of n-grams in this ARPA language model file
 */
@SuppressWarnings("unused")
public int size() {
logger.fine("Counting n-grams in ARPA file");
int count=0;
for (ArpaNgram ngram : this) {
count++;
}
logger.fine("Done counting n-grams in ARPA file");
return count;
}
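/**
 * Gets the highest n-gram order declared in the header
 * ("ngram N=count" lines) of this ARPA language model file.
 *
 * @return the order of the language model, or 0 if no
 *         "ngram N=count" lines are found before the first
 *         n-gram section
 * @throws FileNotFoundException if the underlying file does not exist
 */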
public int getOrder() throws FileNotFoundException {
Pattern pattern = Pattern.compile("^ngram (\\d+)=\\d+$");
if (logger.isLoggable(Level.FINEST)) logger.finest("Pattern is " + pattern.toString());
final Scanner scanner = new Scanner(arpaFile);
int order = 0;
try {
// Scan the header: each "ngram N=count" line updates the order,
// so the last matching line yields the highest order in the file.
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
if (NGRAM_HEADER.matches(line)) {
break;
} else {
Matcher matcher = pattern.matcher(line);
if (matcher.matches()) {
if (logger.isLoggable(Level.FINEST)) logger.finest("DOES match: '" + line + "'");
order = Integer.valueOf(matcher.group(1));
} else if (logger.isLoggable(Level.FINEST)) {
logger.finest("Doesn't match: '" + line + "'");
}
}
}
} finally {
scanner.close();
}
return order;
}
/**
 * Gets an iterator capable of iterating over all n-grams in the ARPA file.
 * Blank lines and section headers are skipped automatically.
 *
 * @return an iterator over all n-grams in the ARPA file,
 *         or null if the file cannot be opened
 */
public Iterator<ArpaNgram> iterator() {
try {
final Scanner scanner;
if (arpaFile.getName().endsWith("gz")) {
InputStream in = new GZIPInputStream(
new FileInputStream(arpaFile));
scanner = new Scanner(in);
} else {
scanner = new Scanner(arpaFile);
}
// Eat initial header lines
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
logger.finest("Discarding line: " + line);
if (NGRAM_HEADER.matches(line)) {
break;
}
}
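// Stream the remaining data lines lazily: hasNext() buffers the next
// n-gram line (skipping blank lines and section markers), and next()
// parses that buffered line into an ArpaNgram.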
return new Iterator<ArpaNgram>() {
String nextLine = null;
int ngramOrder = 1;
// int id = 0;
public boolean hasNext() {
// A data line may already have been buffered by a previous call;
// if so, report it again without consuming more input.
if (nextLine != null) {
return true;
}
if (scanner.hasNext()) {
String line = scanner.nextLine();
boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
// Skip blank lines and section markers, advancing the n-gram order
// each time a new "\N-grams:" or the "\end\" marker is seen.
while (lineIsHeader || BLANK_LINE.matches(line)) {
if (lineIsHeader) {
ngramOrder++;
}
if (scanner.hasNext()) {
line = scanner.nextLine().trim();
lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
} else {
nextLine = null;
return false;
}
}
nextLine = line;
return true;
} else {
nextLine = null;
return false;
}
}
public ArpaNgram next() {
if (nextLine!=null) {
// A data line has the form: log-prob w1 ... wN [backoff-weight]
String[] parts = Regex.spaces.split(nextLine);
float value = Float.valueOf(parts[0]);
// The last word of the n-gram is the predicted word; the words
// before it form its context.
int word = vocab.getID(parts[ngramOrder]);
int[] context = new int[ngramOrder-1];
for (int i=1; i<ngramOrder; i++) {
context[i-1] = vocab.getID(parts[i]);
}
// A trailing field, if present, is the backoff weight for this n-gram.
float backoff;
if (parts.length > ngramOrder+1) {
backoff = Float.valueOf(parts[parts.length-1]);
} else {
backoff = ArpaNgram.DEFAULT_BACKOFF;
}
nextLine = null;
return new ArpaNgram(word, context, value, backoff);
} else {
throw new NoSuchElementException();
}
}
public void remove() {
throw new UnsupportedOperationException();
}
};
} catch (FileNotFoundException e) {
logger.severe(e.toString());
return null;
} catch (IOException e) {
logger.severe(e.toString());
return null;
}
}
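/**
 * Minimal command-line usage sketch: reads the ARPA file named by the
 * first argument, reports the model's n-gram order, and counts its
 * n-grams. Assumes the argument names an uncompressed ARPA file on the
 * local filesystem.
 *
 * @param args first element is the path to an ARPA language model file
 * @throws IOException if the file cannot be read
 */
public static void main(String[] args) throws IOException {
if (args.length < 1) {
System.err.println("Usage: java " + ArpaFile.class.getName() + " <arpa-file>");
return;
}
// The single-argument constructor also populates a fresh Vocabulary.
ArpaFile arpa = new ArpaFile(args[0]);
System.out.println("Language model order: " + arpa.getOrder());
// size() makes a complete pass over every n-gram line in the file.
System.out.println("Total n-grams: " + arpa.size());
}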
}