Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.SuffixArrayFactory

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import joshua.corpus.Corpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.alignment.AlignmentArray;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.vocab.ExternalizableSymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.LineReader;

import java.util.*;
import java.util.logging.Logger;
import java.io.*;


/**
* SuffixArrayFactory is the class that handles the loading and
* saving of SuffixArrays and their associated classes.
*
* @author Chris Callison-Burch
* @since  8 February 2005
* @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
*/

public class SuffixArrayFactory {

  //===============================================================
  // Constants
  //===============================================================

  public static boolean SHOW_PROGRESS = false;
  private static final Logger logger = Logger.getLogger(SuffixArrayFactory.class.getName());

  //===============================================================
  // Static
  //===============================================================

  /**
   * Gets the standard filename for a Vocabulary.
   */
  public static String getVocabularyFileName(String lang, String corpus) {
    return(lang+"_"+corpus+"_vocab.txt");
  }


  /**
   * Gets the standard filename for a CorpusArray.
   */
  public static String getCorpusArrayFileName(String lang, String corpus) {
    return (lang+"_"+corpus+"_sentences.txt");
  }


  /**
   * Gets the standard filename for a SuffixArray.
   */
  public static String getSuffixArrayFileName(String lang, String corpus) {
    return (lang+"_"+corpus+"_suffixes.txt");
  }


  /**
   * Gets the standard filename for an AlignmentArray.
   */
  public static String getAlignmentArrayFileName(String sourceLang, String targetLang, String corpus) {
    return (sourceLang+"_"+targetLang+"_"+corpus+"_alignment_array.txt");
  }


  // TODO: fuse createVocabulary and createCorpusArray together to avoid allocating the trivial array.
  public static CorpusArray createCorpusArray(String inputFilename)
  throws IOException {
    Vocabulary vocabulary = new Vocabulary();
    int[] ws = Vocabulary.initializeVocabulary(inputFilename, vocabulary, true);
    return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
  }


  // TODO: fuse count and createCorpusArray together to avoid allocating the trivial array.
  public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocabulary) throws IOException {
    int[] ws = count(inputFilename);
    return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
  }


  /**
   * Counts the words and sentences in a plain text file.
   *
   * @param inputFilename the plain text file
   * @return a tuple containing the number of words in the
   *         corpus and number of sentences in the corpus
   */
  static int[] count(String inputFilename) throws IOException {

    int numSentences = 0;
    int numWords = 0;

    LineReader lineReader = new LineReader(inputFilename);

    for (String line : lineReader) {
      String[] sentence = line.trim().split("\\s+");
      numWords += sentence.length;
      numSentences++;
      if(SHOW_PROGRESS && numSentences % 10000==0) logger.info(""+numWords);
    }

    int[] numberOfWordsSentences = { numWords, numSentences };
    return numberOfWordsSentences;
  }


  /**
   * Creates a new CorpusArray from a plain text file, given
   * a symbol table created from the same file.
   *
   * @param numWords     the number of words in the file
   *                     (returned by createVocabulary)
   * @param numSentences the number of lines in the file
   *                     (returned by createVocabulary)
   */
  public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocab, int numWords, int numSentences) throws IOException {
    // initialize the arrays.
    int[] corpus = new int[numWords];
    int[] sentenceIndexes = new int[numSentences];

    // instantiate them.
    int wordCounter = 0;
    int sentenceCounter = 0;

    LineReader lineReader = new LineReader(inputFilename);

    for (String phraseString : lineReader) {
      int[] words = vocab.getIDs(phraseString);
//      String[] wordStrings = phraseString.split("\\s+");
//      int[] words = new int[wordStrings.length];
//      for (int i = 0; i < wordStrings.length; i++) {
//        words[i] = vocab.getID(wordStrings[i]);
//      }
//
//      BasicPhrase sentence = new BasicPhrase(words, vocab);
      sentenceIndexes[sentenceCounter] = wordCounter;
      sentenceCounter++;
      System.arraycopy(words, 0, corpus, wordCounter, words.length);
      wordCounter += words.length;
//
//      for(int i = 0; i < sentence.size(); i++) {
//        corpus[wordCounter] = sentence.getWordID(i);
//        wordCounter++;
//      }
//      if(SHOW_PROGRESS && sentenceCounter % 10000==0) logger.info(""+numWords);
    }

    return new CorpusArray(corpus, sentenceIndexes, vocab);
  }


  /**
   * Creates a new SuffixArray from a CorpusArray created
   * from the same file.
   */
  public static SuffixArray createSuffixArray(Corpus corpusArray, int maxCacheSize) throws IOException {
    return new SuffixArray(corpusArray, maxCacheSize);
  }


  /**
   * Creates an Alignments object from a file containing
   * Moses-style alignments, and a source and target corpus.
   */
  public static Alignments createAlignments(String alignmentsFilename, Suffixes sourceCorpus, Suffixes targetCorpus) throws IOException {

    // Maps indices from the target corpus to a list of their aligned source points
    Map<Integer,List<Integer>> alignedSourceList = new HashMap<Integer,List<Integer>>();

    // Maps indices from the source corpus to a list of their aligned target points
    Map<Integer,List<Integer>> alignedTargetList = new HashMap<Integer,List<Integer>>();

    // Maps indices from the target corpus to an array of their aligned source points
    int[][] alignedSourceIndices = new int[targetCorpus.size()][];

    // Maps indices from the source corpus to an array of their aligned target points
    int[][] alignedTargetIndices = new int[sourceCorpus.size()][];

    // set the values of the arrays based on the alignments file...
    int sentenceCounter = 0;

    LineReader lineReader = new LineReader(alignmentsFilename);

    for (String line : lineReader) {

      // Start index of current source sentence
      int sourceOffset = sourceCorpus.getSentencePosition(sentenceCounter);

      // Start index of current target sentence
      int targetOffset = targetCorpus.getSentencePosition(sentenceCounter);

      // To save memory, clear old items that will not be used again
      alignedSourceList.clear();
      alignedTargetList.clear();

      // parse the alignment points
      String[] alignments = line.split("\\s+");
      for(int i = 0; i < alignments.length; i++) {
        String[] points = alignments[i].split("-");
        int sourceIndex = sourceOffset + Integer.parseInt(points[0]);
        int targetIndex = targetOffset + Integer.parseInt(points[1]);

        if (!alignedSourceList.containsKey(targetIndex)) {
          ArrayList<Integer> list = new ArrayList<Integer>();
          list.add(sourceIndex);
          alignedSourceList.put(targetIndex, list);
        } else {
          alignedSourceList.get(targetIndex).add(sourceIndex);
        }

        if (!alignedTargetList.containsKey(sourceIndex)) {
          ArrayList<Integer> list = new ArrayList<Integer>();
          list.add(targetIndex);
          alignedTargetList.put(sourceIndex, list);
        } else {
          alignedTargetList.get(sourceIndex).add(targetIndex);
        }

      }

      int nextSourceOffset = sourceCorpus.getSentencePosition(sentenceCounter+1);
      int nextTargetOffset = targetCorpus.getSentencePosition(sentenceCounter+1);

      for (int i=targetOffset; i<nextTargetOffset; i++) {

        if (alignedSourceList.containsKey(i)) {
          // List of source points aligned to the target index i
          List<Integer> sourceList = alignedSourceList.get(i);
          Collections.sort(sourceList);
          int size=sourceList.size();
          alignedSourceIndices[i] = new int[size];
          for (int j=0; j<size; j++) alignedSourceIndices[i][j] = sourceList.get(j);
        } else {
          alignedSourceIndices[i] = null;
        }

      }

      for (int i=sourceOffset; i<nextSourceOffset; i++) {

        if (alignedTargetList.containsKey(i)) {
          // List of target points aligned to the source index i
          List<Integer> targetList = alignedTargetList.get(i);
          Collections.sort(alignedTargetList.get(i));
          int size=targetList.size();
          alignedTargetIndices[i] = new int[size];
          for (int j=0; j<size; j++) alignedTargetIndices[i][j] = targetList.get(j);
        } else {
          alignedTargetIndices[i] = null;
        }
      }

      sentenceCounter++;
    }

    return new AlignmentArray(alignedTargetIndices, alignedSourceIndices, sentenceCounter);
  }



  //===============================================================
  // Private
  //===============================================================

}
TOP

Related Classes of joshua.corpus.suffix_array.SuffixArrayFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.