/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import joshua.corpus.Corpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.alignment.AlignmentArray;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.vocab.ExternalizableSymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.LineReader;
import java.util.*;
import java.util.logging.Logger;
import java.io.*;
/**
 * SuffixArrayFactory is the class that handles the loading and
 * saving of SuffixArrays and their associated classes.
 * <p>
 * All members are static; the class offers helpers for deriving
 * standard file names and for building corpus arrays, suffix arrays
 * and alignment arrays from plain text files.
 *
 * @author Chris Callison-Burch
 * @since 8 February 2005
 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
 */
public class SuffixArrayFactory {

    //===============================================================
    // Constants
    //===============================================================

    /**
     * When true, {@link #count(String)} logs the running word
     * count after every 10000 sentences.
     */
    public static boolean SHOW_PROGRESS = false;

    /** Logger for this class. */
    private static final Logger logger = Logger.getLogger(SuffixArrayFactory.class.getName());

    //===============================================================
    // Static
    //===============================================================

    /**
     * Gets the standard filename for a Vocabulary.
     *
     * @param lang   language identifier used as the filename prefix
     * @param corpus corpus name
     * @return <code>lang_corpus_vocab.txt</code>
     */
    public static String getVocabularyFileName(String lang, String corpus) {
        return lang + "_" + corpus + "_vocab.txt";
    }

    /**
     * Gets the standard filename for a CorpusArray.
     *
     * @param lang   language identifier used as the filename prefix
     * @param corpus corpus name
     * @return <code>lang_corpus_sentences.txt</code>
     */
    public static String getCorpusArrayFileName(String lang, String corpus) {
        return lang + "_" + corpus + "_sentences.txt";
    }

    /**
     * Gets the standard filename for a SuffixArray.
     *
     * @param lang   language identifier used as the filename prefix
     * @param corpus corpus name
     * @return <code>lang_corpus_suffixes.txt</code>
     */
    public static String getSuffixArrayFileName(String lang, String corpus) {
        return lang + "_" + corpus + "_suffixes.txt";
    }

    /**
     * Gets the standard filename for an AlignmentArray.
     *
     * @param sourceLang source language identifier
     * @param targetLang target language identifier
     * @param corpus     corpus name
     * @return <code>sourceLang_targetLang_corpus_alignment_array.txt</code>
     */
    public static String getAlignmentArrayFileName(String sourceLang, String targetLang, String corpus) {
        return sourceLang + "_" + targetLang + "_" + corpus + "_alignment_array.txt";
    }

    /**
     * Creates a new CorpusArray from a plain text file, building
     * a fresh Vocabulary from that same file first.
     *
     * @param inputFilename the plain text file
     * @return the corpus array for the file
     * @throws IOException if the file cannot be read
     */
    // TODO: fuse createVocabulary and createCorpusArray together to avoid allocating the trivial array.
    public static CorpusArray createCorpusArray(String inputFilename) throws IOException {
        Vocabulary vocabulary = new Vocabulary();
        // ws[0] is passed on as the word count and ws[1] as the sentence count.
        int[] ws = Vocabulary.initializeVocabulary(inputFilename, vocabulary, true);
        return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
    }

    /**
     * Creates a new CorpusArray from a plain text file using an
     * existing symbol table; the file is first scanned once by
     * {@link #count(String)} to size the arrays.
     *
     * @param inputFilename the plain text file
     * @param vocabulary    symbol table used to map the words to IDs
     * @return the corpus array for the file
     * @throws IOException if the file cannot be read
     */
    // TODO: fuse count and createCorpusArray together to avoid allocating the trivial array.
    public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocabulary) throws IOException {
        int[] ws = count(inputFilename);
        return createCorpusArray(inputFilename, vocabulary, ws[0], ws[1]);
    }

    /**
     * Counts the words and sentences in a plain text file.
     * <p>
     * Each line is one sentence; words are the whitespace-separated
     * tokens of the trimmed line. Note that a blank line still
     * contributes one (empty) token to the word count, because
     * splitting an empty string yields a single empty element.
     *
     * @param inputFilename the plain text file
     * @return a tuple containing the number of words in the
     *         corpus and number of sentences in the corpus
     * @throws IOException if the file cannot be read
     */
    static int[] count(String inputFilename) throws IOException {
        int numSentences = 0;
        int numWords = 0;

        LineReader lineReader = new LineReader(inputFilename);
        try {
            for (String line : lineReader) {
                String[] sentence = line.trim().split("\\s+");
                numWords += sentence.length;
                numSentences++;
                if (SHOW_PROGRESS && numSentences % 10000 == 0) {
                    logger.info("" + numWords);
                }
            }
        } finally {
            // Release the file handle even if reading fails part-way.
            lineReader.close();
        }

        return new int[] { numWords, numSentences };
    }

    /**
     * Creates a new CorpusArray from a plain text file, given
     * a symbol table created from the same file.
     * <p>
     * The supplied counts size the backing arrays, so they must be
     * at least as large as what the file actually contains;
     * otherwise an index-out-of-bounds exception is thrown while
     * the arrays are being filled.
     *
     * @param inputFilename the plain text file
     * @param vocab         symbol table used to map each line to word IDs
     * @param numWords      the number of words in the file
     *                      (returned by createVocabulary)
     * @param numSentences  the number of lines in the file
     *                      (returned by createVocabulary)
     * @return the corpus array for the file
     * @throws IOException if the file cannot be read
     */
    public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocab, int numWords, int numSentences) throws IOException {
        // All word IDs of the corpus, concatenated sentence after sentence.
        int[] corpus = new int[numWords];
        // sentenceIndexes[s] is the position in corpus where sentence s starts.
        int[] sentenceIndexes = new int[numSentences];

        int wordCounter = 0;
        int sentenceCounter = 0;

        LineReader lineReader = new LineReader(inputFilename);
        try {
            for (String phraseString : lineReader) {
                int[] words = vocab.getIDs(phraseString);

                sentenceIndexes[sentenceCounter] = wordCounter;
                sentenceCounter++;

                System.arraycopy(words, 0, corpus, wordCounter, words.length);
                wordCounter += words.length;
            }
        } finally {
            // Release the file handle even if filling the arrays fails.
            lineReader.close();
        }

        return new CorpusArray(corpus, sentenceIndexes, vocab);
    }

    /**
     * Creates a new SuffixArray from a CorpusArray created
     * from the same file.
     *
     * @param corpusArray  the corpus to index
     * @param maxCacheSize cache size handed to the SuffixArray constructor
     * @return the new suffix array
     * @throws IOException declared for callers; thrown if the
     *                     SuffixArray constructor reports an I/O failure
     */
    public static SuffixArray createSuffixArray(Corpus corpusArray, int maxCacheSize) throws IOException {
        return new SuffixArray(corpusArray, maxCacheSize);
    }

    /**
     * Creates an Alignments object from a file containing
     * Moses-style alignments, and a source and target corpus.
     * <p>
     * Each line of the file describes one sentence pair as
     * whitespace-separated <code>s-t</code> tokens, where
     * <code>s</code> and <code>t</code> are integer word positions
     * relative to the start of the source and target sentence. They
     * are converted to corpus-wide positions by adding the sentence
     * offsets obtained from the two corpora. A blank line (a sentence
     * pair with no alignment points) is accepted and leaves every
     * word of that sentence pair unaligned.
     *
     * @param alignmentsFilename the alignments file, one line per sentence pair
     * @param sourceCorpus       the source-side corpus
     * @param targetCorpus       the target-side corpus
     * @return the alignment array built from the file
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if a non-empty token does not
     *                               start with two integers separated by '-'
     */
    public static Alignments createAlignments(String alignmentsFilename, Suffixes sourceCorpus, Suffixes targetCorpus) throws IOException {

        // Maps indices from the target corpus to a list of their aligned source points
        Map<Integer,List<Integer>> alignedSourceList = new HashMap<Integer,List<Integer>>();

        // Maps indices from the source corpus to a list of their aligned target points
        Map<Integer,List<Integer>> alignedTargetList = new HashMap<Integer,List<Integer>>();

        // Maps indices from the target corpus to an array of their aligned source points
        int[][] alignedSourceIndices = new int[targetCorpus.size()][];

        // Maps indices from the source corpus to an array of their aligned target points
        int[][] alignedTargetIndices = new int[sourceCorpus.size()][];

        // set the values of the arrays based on the alignments file...
        int sentenceCounter = 0;

        LineReader lineReader = new LineReader(alignmentsFilename);
        try {
            for (String line : lineReader) {

                // Start index of current source sentence
                int sourceOffset = sourceCorpus.getSentencePosition(sentenceCounter);

                // Start index of current target sentence
                int targetOffset = targetCorpus.getSentencePosition(sentenceCounter);

                // To save memory, clear old items that will not be used again
                alignedSourceList.clear();
                alignedTargetList.clear();

                // parse the alignment points
                String[] alignments = line.trim().split("\\s+");
                for (int i = 0; i < alignments.length; i++) {
                    // A blank line splits into a single empty token; skip it
                    // instead of failing inside Integer.parseInt.
                    if (alignments[i].length() == 0) {
                        continue;
                    }

                    String[] points = alignments[i].split("-");
                    int sourceIndex = sourceOffset + Integer.parseInt(points[0]);
                    int targetIndex = targetOffset + Integer.parseInt(points[1]);

                    addAlignmentPoint(alignedSourceList, targetIndex, sourceIndex);
                    addAlignmentPoint(alignedTargetList, sourceIndex, targetIndex);
                }

                // The start of the next sentence bounds the current one.
                // NOTE(review): for the last sentence this relies on
                // getSentencePosition(n+1) returning an end-of-corpus
                // position -- confirm against the Suffixes implementation.
                int nextSourceOffset = sourceCorpus.getSentencePosition(sentenceCounter + 1);
                int nextTargetOffset = targetCorpus.getSentencePosition(sentenceCounter + 1);

                // Unaligned positions are stored as null.
                for (int i = targetOffset; i < nextTargetOffset; i++) {
                    alignedSourceIndices[i] = toSortedArray(alignedSourceList.get(i));
                }

                for (int i = sourceOffset; i < nextSourceOffset; i++) {
                    alignedTargetIndices[i] = toSortedArray(alignedTargetList.get(i));
                }

                sentenceCounter++;
            }
        } finally {
            // Release the file handle even if a line fails to parse.
            lineReader.close();
        }

        return new AlignmentArray(alignedTargetIndices, alignedSourceIndices, sentenceCounter);
    }

    //===============================================================
    // Private
    //===============================================================

    /**
     * Appends <code>value</code> to the list stored under
     * <code>key</code>, creating the list on first use.
     */
    private static void addAlignmentPoint(Map<Integer,List<Integer>> pointsByIndex, int key, int value) {
        List<Integer> points = pointsByIndex.get(key);
        if (points == null) {
            points = new ArrayList<Integer>();
            pointsByIndex.put(key, points);
        }
        points.add(value);
    }

    /**
     * Sorts the given list in place and copies it into a new int
     * array; returns null when the list itself is null.
     */
    private static int[] toSortedArray(List<Integer> points) {
        if (points == null) {
            return null;
        }
        Collections.sort(points);
        int[] sorted = new int[points.size()];
        for (int j = 0; j < sorted.length; j++) {
            sorted[j] = points.get(j);
        }
        return sorted;
    }
}