Package joshua.decoder.ff.lm

Source Code of joshua.decoder.ff.lm.ArpaFile

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.decoder.ff.lm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Regex;
import joshua.util.io.LineReader;

/**
* Utility class for reading ARPA language model files.
*
* @author Lane Schwartz
*/
public class ArpaFile implements Iterable<ArpaNgram> {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(ArpaFile.class.getName());
 
  /** Regular expression representing a blank line. */
  public static final Regex BLANK_LINE  = new Regex("^\\s*$");
 
  /**
   * Regular expression representing a line
   * starting a new section of n-grams in an ARPA language model file.
   */
  public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$");
 
  /**
   * Regular expression representing a line
   * ending an ARPA language model file.
   */
  public static final Regex NGRAM_END = new Regex("^\\\\end\\\\s*$");
 
  /** ARPA file for this object. */
  private final File arpaFile;
 
  /** The symbol table associated with this object. */
  private final SymbolTable vocab;
 
  /**
   * Constructs an object that represents an ARPA language model file.
   *
   * @param arpaFileName File name of an ARPA language model file
   * @param vocab Symbol table to be used by this object
   */
  public ArpaFile(String arpaFileName, SymbolTable vocab) {
    this.arpaFile = new File(arpaFileName);
    this.vocab = vocab;
  }

  public ArpaFile(String arpaFileName) throws IOException {
    this.arpaFile = new File(arpaFileName);
    this.vocab = new Vocabulary();
   
//    final Scanner scanner = new Scanner(arpaFile);
   
//    // Eat initial header lines
//    while (scanner.hasNextLine()) {
//      String line = scanner.nextLine();
//      logger.finest("Discarding line: " + line);
//      if (NGRAM_HEADER.matches(line)) {
//        break;
//      }
//    }
   
//    int ngramOrder = 1;
   
    LineReader grammarReader = new LineReader(arpaFileName);
   
    try {
      for (String line : grammarReader) {


//    while (scanner.hasNext()) {
//     
//      String line = scanner.nextLine();

        String[] parts = Regex.spaces.split(line);
        if (parts.length > 1) {
          String[] words = Regex.spaces.split(parts[1]);

          for (String word : words) {
            if (logger.isLoggable(Level.FINE)) logger.fine("Adding to vocab: " + word);
            vocab.addTerminal(word)
          }

        } else {
          logger.info(line);
        }

      }
    } finally {
      grammarReader.close();
    }

//     
//      boolean lineIsHeader = NGRAM_HEADER.matches(line);
//     
//      while (lineIsHeader || BLANK_LINE.matches(line)) {
//       
//        if (lineIsHeader) {
//          ngramOrder++;
//        }
//       
//        if (scanner.hasNext()) {
//          line = scanner.nextLine().trim();
//          lineIsHeader = NGRAM_HEADER.matches(line);
//        } else {
//          logger.severe("Ran out of lines!");
//          return;
//        }
//      }
     
     
//     
//      // Add word to vocab
//      if (logger.isLoggable(Level.FINE)) logger.fine("Adding word to vocab: " + parts[ngramOrder]);
//      vocab.addTerminal(parts[ngramOrder]);
//     
//      // Add context words to vocab
//      for (int i=1; i<ngramOrder; i++) {
//        if (logger.isLoggable(Level.FINE)) logger.fine("Adding context word to vocab: " + parts[i]);
//        vocab.addTerminal(parts[i]);
//      }
     
//    }
   
    logger.info("Done constructing ArpaFile");
   
  }
 
  /**
   * Gets the symbol table associated with this object.
   *
   * @return the symbol table associated with this object
   */
  public SymbolTable getVocab() {
    return vocab;
  }
 
  /**
   * Gets the total number of n-grams
   * in this ARPA language model file.
   *
   * @return total number of n-grams
   *         in this ARPA language model file
   */
  @SuppressWarnings("unused")
  public int size() {

    logger.fine("Counting n-grams in ARPA file");
    int count=0;
   
    for (ArpaNgram ngram : this) {
      count++;
    }
    logger.fine("Done counting n-grams in ARPA file");
   
    return count;
  }
 
  public int getOrder() throws FileNotFoundException {

    Pattern pattern = Pattern.compile("^ngram (\\d+)=\\d+$");
    if (logger.isLoggable(Level.FINEST)) logger.finest("Pattern is " + pattern.toString());
    final Scanner scanner = new Scanner(arpaFile);

    int order = 0;
   
    // Eat initial header lines
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
     
      if (NGRAM_HEADER.matches(line)) {
        break;
      } else {
        Matcher matcher = pattern.matcher(line);
        if (matcher.matches()) {
          if (logger.isLoggable(Level.FINEST)) logger.finest("DOES   match: \'" + line + "\'");
          order = Integer.valueOf(matcher.group(1));
        } else if (logger.isLoggable(Level.FINEST)) {
          logger.finest("Doesn't match: \'" + line + "\'");
        }
      }
    }
   
    return order;
  }
 
  /**
   * Gets an iterator capable of iterating
   * over all n-grams in the ARPA file.
   *
   * @return an iterator capable of iterating
   *         over all n-grams in the ARPA file
   */
  public Iterator<ArpaNgram> iterator() {

    try {
      final Scanner scanner;
     
      if (arpaFile.getName().endsWith("gz")) {
        InputStream in = new GZIPInputStream(
            new FileInputStream(arpaFile));
        scanner = new Scanner(in);
      } else {
        scanner = new Scanner(arpaFile);
      }
     
      // Eat initial header lines
      while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        logger.finest("Discarding line: " + line);
        if (NGRAM_HEADER.matches(line)) {
          break;
        }
      }
     
      return new Iterator<ArpaNgram>() {
       
        String nextLine = null;
        int ngramOrder = 1;
//        int id = 0;
       
        public boolean hasNext() {
         
          if (scanner.hasNext()) {
           
            String line = scanner.nextLine();
           
            boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
           
            while (lineIsHeader || BLANK_LINE.matches(line)) {
             
              if (lineIsHeader) {
                ngramOrder++;
              }
             
              if (scanner.hasNext()) {
                line = scanner.nextLine().trim();
                lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
              } else {
                nextLine = null;
                return false;
              }
            }
           
            nextLine = line;
            return true;
           
          } else {
            nextLine = null;
            return false;
          }
         
        }

        public ArpaNgram next() {
          if (nextLine!=null) {
           
            String[] parts = Regex.spaces.split(nextLine);

            float value = Float.valueOf(parts[0]);
           
            int word = vocab.getID(parts[ngramOrder]);
           
            int[] context = new int[ngramOrder-1];
            for (int i=1; i<ngramOrder; i++) {
              context[i-1] = vocab.getID(parts[i]);
            }
           
            float backoff;
            if (parts.length > ngramOrder+1) {
              backoff = Float.valueOf(parts[parts.length-1]);
            } else {
              backoff = ArpaNgram.DEFAULT_BACKOFF;
            }
           
            nextLine = null;
            return new ArpaNgram(word, context, value, backoff);
           
          } else {
            throw new NoSuchElementException();
          }
        }

        public void remove() {
          throw new UnsupportedOperationException();
        }
       
      };
    } catch (FileNotFoundException e) {
      logger.severe(e.toString());
      return null;
    } catch (IOException e) {
      logger.severe(e.toString());
      return null;
    }
   
  }
}
TOP

Related Classes of joshua.decoder.ff.lm.ArpaFile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.