// Source Code of joshua.corpus.suffix_array.SuffixArrayFactory

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;


import joshua.corpus.Corpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.alignment.AlignmentArray;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.vocab.ExternalizableSymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.LineReader;


import java.util.*;
import java.util.logging.Logger;
import java.io.*;




/**
 * SuffixArrayFactory is the class that handles the loading and
 * saving of SuffixArrays and their associated classes.
 * 
 * @author Chris Callison-Burch
 * @since  8 February 2005
 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
 */


public class SuffixArrayFactory {


  //===============================================================
  // Constants
  //===============================================================


  public static boolean SHOW_PROGRESS = false;
  private static final Logger logger = Logger.getLogger(SuffixArrayFactory.class.getName());


  //===============================================================
  // Static
  //===============================================================


  /**
   * Gets the standard filename for a Vocabulary.
   */
  public static String getVocabularyFileName(String lang, String corpus) {
    return(lang+"_"+corpus+"_vocab.txt"); 
  }




  /**
   * Gets the standard filename for a CorpusArray.
   */
  public static String getCorpusArrayFileName(String lang, String corpus) {
    return (lang+"_"+corpus+"_sentences.txt"); 
  }




  /**
   * Gets the standard filename for a SuffixArray.
   */
  public static String getSuffixArrayFileName(String lang, String corpus) {
    return (lang+"_"+corpus+"_suffixes.txt"); 
  }




  /**
   * Gets the standard filename for an AlignmentArray.
   */
  public static String getAlignmentArrayFileName(String sourceLang, String targetLang, String corpus) {
    return (sourceLang+"_"+targetLang+"_"+corpus+"_alignment_array.txt"); 
  }




  // TODO: fuse createVocabulary and createCorpusArray together to avoid allocating the trivial array.
  public static CorpusArray createCorpusArray(String inputFilename)
  throws IOException {
    Vocabulary vocabulary = new Vocabulary();
    int[] ws = Vocabulary.initializeVocabulary(inputFilename, vocabulary, true);
    return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
  }




  // TODO: fuse count and createCorpusArray together to avoid allocating the trivial array.
  public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocabulary) throws IOException {
    int[] ws = count(inputFilename);
    return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
  }




  /**
   * Counts the words and sentences in a plain text file.
   *
   * @param inputFilename the plain text file
   * @return a tuple containing the number of words in the
   *         corpus and number of sentences in the corpus
   */
  static int[] count(String inputFilename) throws IOException {


    int numSentences = 0;
    int numWords = 0;


    LineReader lineReader = new LineReader(inputFilename);


    for (String line : lineReader) {
      String[] sentence = line.trim().split("\\s+");
      numWords += sentence.length;
      numSentences++;
      if(SHOW_PROGRESS && numSentences % 10000==0) logger.info(""+numWords);
    }


    int[] numberOfWordsSentences = { numWords, numSentences };
    return numberOfWordsSentences;
  }




  /**
   * Creates a new CorpusArray from a plain text file, given
   * a symbol table created from the same file.
   *
   * @param numWords     the number of words in the file
   *                     (returned by createVocabulary)
   * @param numSentences the number of lines in the file
   *                     (returned by createVocabulary)
   */
  public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocab, int numWords, int numSentences) throws IOException {
    // initialize the arrays.
    int[] corpus = new int[numWords];
    int[] sentenceIndexes = new int[numSentences];


    // instantiate them.
    int wordCounter = 0;
    int sentenceCounter = 0;


    LineReader lineReader = new LineReader(inputFilename);


    for (String phraseString : lineReader) {
      int[] words = vocab.getIDs(phraseString);
//      String[] wordStrings = phraseString.split("\\s+");
//      int[] words = new int[wordStrings.length];
//      for (int i = 0; i < wordStrings.length; i++) {
//        words[i] = vocab.getID(wordStrings[i]);
//      }
//
//      BasicPhrase sentence = new BasicPhrase(words, vocab);
      sentenceIndexes[sentenceCounter] = wordCounter;
      sentenceCounter++;
      System.arraycopy(words, 0, corpus, wordCounter, words.length);
      wordCounter += words.length;
//
//      for(int i = 0; i < sentence.size(); i++) {
//        corpus[wordCounter] = sentence.getWordID(i);
//        wordCounter++;
//      }
//      if(SHOW_PROGRESS && sentenceCounter % 10000==0) logger.info(""+numWords);
    }


    return new CorpusArray(corpus, sentenceIndexes, vocab);
  }




  /**
   * Creates a new SuffixArray from a CorpusArray created
   * from the same file.
   */
  public static SuffixArray createSuffixArray(Corpus corpusArray, int maxCacheSize) throws IOException {
    return new SuffixArray(corpusArray, maxCacheSize);
  }




  /**
   * Creates an Alignments object from a file containing
   * Moses-style alignments, and a source and target corpus.
   */
  public static Alignments createAlignments(String alignmentsFilename, Suffixes sourceCorpus, Suffixes targetCorpus) throws IOException {


    // Maps indices from the target corpus to a list of their aligned source points
    Map<Integer,List<Integer>> alignedSourceList = new HashMap<Integer,List<Integer>>();


    // Maps indices from the source corpus to a list of their aligned target points
    Map<Integer,List<Integer>> alignedTargetList = new HashMap<Integer,List<Integer>>();


    // Maps indices from the target corpus to an array of their aligned source points
    int[][] alignedSourceIndices = new int[targetCorpus.size()][];


    // Maps indices from the source corpus to an array of their aligned target points
    int[][] alignedTargetIndices = new int[sourceCorpus.size()][];


    // set the values of the arrays based on the alignments file...
    int sentenceCounter = 0;


    LineReader lineReader = new LineReader(alignmentsFilename);


    for (String line : lineReader) {


      // Start index of current source sentence
      int sourceOffset = sourceCorpus.getSentencePosition(sentenceCounter);


      // Start index of current target sentence
      int targetOffset = targetCorpus.getSentencePosition(sentenceCounter);


      // To save memory, clear old items that will not be used again
      alignedSourceList.clear();
      alignedTargetList.clear();


      // parse the alignment points
      String[] alignments = line.split("\\s+");
      for(int i = 0; i < alignments.length; i++) {
        String[] points = alignments[i].split("-");
        int sourceIndex = sourceOffset + Integer.parseInt(points[0]);
        int targetIndex = targetOffset + Integer.parseInt(points[1]);


        if (!alignedSourceList.containsKey(targetIndex)) {
          ArrayList<Integer> list = new ArrayList<Integer>();
          list.add(sourceIndex);
          alignedSourceList.put(targetIndex, list);
        } else {
          alignedSourceList.get(targetIndex).add(sourceIndex);
        }


        if (!alignedTargetList.containsKey(sourceIndex)) {
          ArrayList<Integer> list = new ArrayList<Integer>();
          list.add(targetIndex);
          alignedTargetList.put(sourceIndex, list);
        } else {
          alignedTargetList.get(sourceIndex).add(targetIndex);
        }


      }


      int nextSourceOffset = sourceCorpus.getSentencePosition(sentenceCounter+1);
      int nextTargetOffset = targetCorpus.getSentencePosition(sentenceCounter+1);


      for (int i=targetOffset; i<nextTargetOffset; i++) {


        if (alignedSourceList.containsKey(i)) {
          // List of source points aligned to the target index i
          List<Integer> sourceList = alignedSourceList.get(i);
          Collections.sort(sourceList);
          int size=sourceList.size();
          alignedSourceIndices[i] = new int[size];
          for (int j=0; j<size; j++) alignedSourceIndices[i][j] = sourceList.get(j);
        } else {
          alignedSourceIndices[i] = null;
        }


      }


      for (int i=sourceOffset; i<nextSourceOffset; i++) {


        if (alignedTargetList.containsKey(i)) {
          // List of target points aligned to the source index i
          List<Integer> targetList = alignedTargetList.get(i);
          Collections.sort(alignedTargetList.get(i));
          int size=targetList.size();
          alignedTargetIndices[i] = new int[size];
          for (int j=0; j<size; j++) alignedTargetIndices[i][j] = targetList.get(j);
        } else {
          alignedTargetIndices[i] = null;
        }
      }


      sentenceCounter++;
    }


    return new AlignmentArray(alignedTargetIndices, alignedSourceIndices, sentenceCounter);
  }






  //===============================================================
  // Private 
  //===============================================================


}
// Source Code of joshua.corpus.suffix_array.SuffixArrayFactory

// Related Classes of joshua.corpus.suffix_array.SuffixArrayFactory