{
    // Serialize the common symbol table to "common.vocab" inside the output
    // directory, then report the file location on the summary stream.
    String binaryVocabFilename = outputDirName + File.separator + "common.vocab";
    if (logger.isLoggable(Level.INFO)) {
        logger.info("Writing binary common vocabulary to disk at " + binaryVocabFilename);
    }

    // Keep a handle on the raw file stream so it is always closed, even when
    // serialization fails; previously the file handle was never released.
    FileOutputStream vocabFileStream = new FileOutputStream(binaryVocabFilename);
    try {
        ObjectOutput vocabOut = new BinaryOut(vocabFileStream, true);
        // NOTE(review): presumably selects the character encoding that
        // writeExternal uses for the vocabulary strings - confirm in the symbol table class.
        symbolTable.setExternalizableEncoding(charset);
        symbolTable.writeExternal(vocabOut);
        // Push buffered bytes down to the file before the file stream is closed.
        vocabOut.flush();
    } finally {
        vocabFileStream.close();
    }

    out.println("Common symbol table for source and target language: " + binaryVocabFilename);
}
// Build the source-language corpus array from the source corpus file,
// using the common symbol table.
if (logger.isLoggable(Level.INFO)) {
    logger.info("Constructing corpus array from file " + sourceCorpusFileName);
}
CorpusArray sourceCorpusArray =
        SuffixArrayFactory.createCorpusArray(
                sourceCorpusFileName,
                symbolTable,
                sourceLengths[0],
                sourceLengths[1]);
// Write source corpus to disk
{
    // The corpus array is serialized to "source.corpus" inside the output
    // directory and the file location is reported on the summary stream.
    String binarySourceCorpusFilename = outputDirName + File.separator + "source.corpus";
    if (logger.isLoggable(Level.INFO)) {
        logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
    }

    // Close the underlying file stream in all cases; it used to be left open.
    FileOutputStream corpusFileStream = new FileOutputStream(binarySourceCorpusFilename);
    try {
        BinaryOut corpusOut = new BinaryOut(corpusFileStream, false);
        sourceCorpusArray.writeExternal(corpusOut);
        // Flush buffered bytes to the file before the file stream is closed.
        corpusOut.flush();
    } finally {
        corpusFileStream.close();
    }

    out.println("Source language corpus: " + binarySourceCorpusFilename);
}
// Build the target-language corpus array from the target corpus file,
// using the same symbol table as the source side.
if (logger.isLoggable(Level.INFO)) {
    logger.info("Constructing corpus array from file " + targetCorpusFileName);
}
CorpusArray targetCorpusArray =
        SuffixArrayFactory.createCorpusArray(
                targetCorpusFileName,
                symbolTable,
                targetLengths[0],
                targetLengths[1]);
// Write target language corpus to disk
{
    // The corpus array is serialized to "target.corpus" inside the output
    // directory and the file location is reported on the summary stream.
    String binaryTargetCorpusFilename = outputDirName + File.separator + "target.corpus";
    if (logger.isLoggable(Level.INFO)) {
        logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
    }

    // Close the underlying file stream in all cases; it used to be left open.
    FileOutputStream corpusFileStream = new FileOutputStream(binaryTargetCorpusFilename);
    try {
        BinaryOut corpusOut = new BinaryOut(corpusFileStream, false);
        targetCorpusArray.writeExternal(corpusOut);
        // Flush buffered bytes to the file before the file stream is closed.
        corpusOut.flush();
    } finally {
        corpusFileStream.close();
    }

    out.println("Target language corpus: " + binaryTargetCorpusFilename);
}
{
    // Builds the alignment grids from the alignments file, writes them to
    // "alignment.grids", then derives the lexical probability table and writes
    // it both as binary counts ("lexicon.counts") and as text ("lexprobs.txt").

    // The scanner over the alignments file is closed only after everything that
    // uses the grids has finished, in case the grids read from it lazily.
    // Previously it was never closed.
    Scanner alignmentsScanner = new Scanner(new File(alignmentsFileName));
    try {
        // Construct alignments data structure
        AlignmentGrids grids = new AlignmentGrids(
                alignmentsScanner,
                sourceCorpusArray,
                targetCorpusArray,
                numberOfSentences);

        // Write alignments to disk
        {
            String binaryAlignmentsFilename = outputDirName + File.separator + "alignment.grids";
            if (logger.isLoggable(Level.INFO)) {
                logger.info("Writing binary alignment grids to disk at " + binaryAlignmentsFilename);
            }
            BinaryOut alignmentsOut = new BinaryOut(binaryAlignmentsFilename);
            try {
                grids.writeExternal(alignmentsOut);
                alignmentsOut.flush();
            } finally {
                // Release the file even when serialization fails.
                alignmentsOut.close();
            }
            out.println("Source-target alignment grids: " + binaryAlignmentsFilename);
        }

        // Write lexprobs to disk
        {
            ParallelCorpus parallelCorpus = new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, grids);
            if (logger.isLoggable(Level.INFO)) {
                logger.info("Constructing lexprob table");
            }
            // NOTE(review): the meaning of the Float.MIN_VALUE argument is not
            // visible here - confirm against the LexProbs constructor.
            LexicalProbabilities lexProbs =
                    new LexProbs(parallelCorpus, Float.MIN_VALUE);

            String lexprobsFilename = outputDirName + File.separator + "lexprobs.txt";
            // Opened before the binary counts are written, as in the original ordering.
            FileOutputStream stream = new FileOutputStream(lexprobsFilename);
            try {
                OutputStreamWriter lexprobsOut = new OutputStreamWriter(stream, charset);

                String binaryLexCountFilename = outputDirName + File.separator + "lexicon.counts";
                if (logger.isLoggable(Level.INFO)) {
                    logger.info("Writing binary lexicon counts to disk at " + binaryLexCountFilename);
                }
                // The counts go through standard Java object serialization
                // (ObjectOutputStream) rather than BinaryOut.
                FileOutputStream lexCountStream = new FileOutputStream(binaryLexCountFilename);
                try {
                    ObjectOutput lexCountOut = new ObjectOutputStream(lexCountStream);
                    lexProbs.writeExternal(lexCountOut);
                    lexCountOut.close();
                } finally {
                    // Closing an already-closed FileOutputStream has no effect, so this
                    // only matters when an exception skipped the close above.
                    lexCountStream.close();
                }

                String s = lexProbs.toString();
                if (logger.isLoggable(Level.INFO)) {
                    logger.info("Writing lexprobs at " + lexprobsFilename);
                }
                lexprobsOut.write(s);
                lexprobsOut.flush();
                lexprobsOut.close();
            } finally {
                // Guarantees the text file handle is released on every path.
                stream.close();
            }
            out.println("Lexprobs at " + lexprobsFilename);
        }
    } finally {
        alignmentsScanner.close();
    }
}
// Write target language suffix array to disk
{
    // Construct target language suffix array
    if (logger.isLoggable(Level.INFO)) {
        logger.info("Constructing suffix array from file " + targetCorpusFileName);
    }
    SuffixArray targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, Cache.DEFAULT_CAPACITY);

    // The suffix array is serialized to "target.suffixes" inside the output directory.
    String binaryTargetSuffixesFilename = outputDirName + File.separator + "target.suffixes";
    if (logger.isLoggable(Level.INFO)) {
        // The message used to say "target corpus", copied from the corpus block.
        logger.info("Writing binary target suffix array to disk at " + binaryTargetSuffixesFilename);
    }

    // Close the underlying file stream in all cases; it used to be left open.
    FileOutputStream suffixesFileStream = new FileOutputStream(binaryTargetSuffixesFilename);
    try {
        BinaryOut suffixesOut = new BinaryOut(suffixesFileStream, false);
        targetSuffixArray.writeExternal(suffixesOut);
        // Flush buffered bytes to the file before the file stream is closed.
        suffixesOut.flush();
    } finally {
        suffixesFileStream.close();
    }

    out.println("Target language suffix array: " + binaryTargetSuffixesFilename);
}
{
    // Builds the source-language suffix array, writes it to "source.suffixes",
    // then precomputes the frequent phrases from it and writes them to
    // "frequentPhrases".

    // Construct source language suffix array
    if (logger.isLoggable(Level.INFO)) {
        logger.info("Constructing suffix array from file " + sourceCorpusFileName);
    }
    SuffixArray sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, Cache.DEFAULT_CAPACITY);

    // Write source language suffix array to disk
    {
        String binarySourceSuffixesFilename = outputDirName + File.separator + "source.suffixes";
        if (logger.isLoggable(Level.INFO)) {
            // The message used to say "source corpus", copied from the corpus block.
            logger.info("Writing binary source suffix array to disk at " + binarySourceSuffixesFilename);
        }

        // Close the underlying file stream in all cases; it used to be left open.
        FileOutputStream suffixesFileStream = new FileOutputStream(binarySourceSuffixesFilename);
        try {
            BinaryOut suffixesOut = new BinaryOut(suffixesFileStream, false);
            sourceSuffixArray.writeExternal(suffixesOut);
            // Flush buffered bytes to the file before the file stream is closed.
            suffixesOut.flush();
        } finally {
            suffixesFileStream.close();
        }

        out.println("Source language suffix array: " + binarySourceSuffixesFilename);
    }

    // Precompute and write frequent phrase locations to disk
    {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Precomputing indices for most frequent phrases");
        }
        // NOTE(review): maxPhraseLength is passed for two consecutive parameters;
        // confirm against the FrequentPhrases constructor that this is intended.
        FrequentPhrases frequentPhrases =
                new FrequentPhrases(sourceSuffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);

        String frequentPhrasesFilename = outputDirName + File.separator + "frequentPhrases";
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Writing precomputing indices for most frequent phrases at " + frequentPhrasesFilename);
        }
        BinaryOut frequentPhrasesOut = new BinaryOut(frequentPhrasesFilename);
        try {
            frequentPhrases.writeExternal(frequentPhrasesOut);
        } finally {
            // Release the file even when serialization fails.
            frequentPhrasesOut.close();
        }
    }
}
// Flush and close the stream that received the per-file summary lines
// printed by the steps above.
out.flush();
out.close();