* @throws IOException if an I/O error occurs while reading the input vocabulary/table files or writing the output files
*/
public static void createTTableFromHooka(String srcVocabFile, String trgVocabFile, String tableFile, String finalSrcVocabFile,
String finalTrgVocabFile, String finalTableFile, float probThreshold, int numTrans, FileSystem fs) throws IOException{
logger.setLevel(Level.DEBUG);
Vocab srcVocab = HadoopAlign.loadVocab(new Path(srcVocabFile), fs);
Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);
logger.debug(ttable.getMaxE() + "," + ttable.getMaxF());
Vocab finalSrcVocab = new VocabularyWritable();
Vocab finalTrgVocab = new VocabularyWritable();
TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();
String srcTerm = null, trgTerm = null;
int curIndex = -1;
TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
float sumOfProbs = 0.0f, prob;
// int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0; // for statistical purposes only
HookaStats stats = new HookaStats(numTrans, probThreshold);
// Prune the current ttable: keep at most the top numTrans translations per source term, and stop scanning a term's translations early once the cumulative probability of the kept ones exceeds probThreshold.
for (int srcIndex = 1; srcIndex < srcVocab.size(); srcIndex++) {
int[] translations;
try {
translations = ttable.get(srcIndex).getTranslations(0f);
} catch (Exception e) {
logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
continue;
}
srcTerm = srcVocab.get(srcIndex);
curIndex = finalSrcVocab.addOrGet(srcTerm);
//initialize this term
topTrans.clear();
sumOfProbs = 0.0f;
logger.debug("Processing: " + srcTerm + " with index: " + curIndex + " ("+srcIndex+"); " + translations.length + " translations");
for (int trgIndex : translations) {
try {
trgTerm = trgVocab.get(trgIndex);
} catch (Exception e) {
logger.debug("Skipping " + trgIndex);
continue;
}
prob = ttable.get(srcIndex, trgIndex);
logger.debug("Found: " + trgTerm + " with " + prob);
topTrans.add(new PairOfFloatString(prob, trgTerm));
// keep top numTrans translations
if (topTrans.size() > numTrans) {
float removedProb = topTrans.pollFirst().getLeftElement();
sumOfProbs -= removedProb;
}
sumOfProbs += prob;
if (sumOfProbs > probThreshold) {
logger.debug("Sum of probs > "+probThreshold+", early termination.");
break;
}
}
//store the current source term's top translations in the final ttable
if(topTrans.size() > 0){
addToTable(curIndex, topTrans, sumOfProbs, finalTTable, finalTrgVocab, probThreshold, stats);
}
}
System.err.println("Vocabulary Target: "+finalTrgVocab.size()+" elements");
System.err.println("Vocabulary Source: "+finalSrcVocab.size()+" elements");
System.err.println(stats);
FSDataOutputStream outputStream1 = fs.create(new Path(finalTrgVocabFile));
((VocabularyWritable) finalTrgVocab).write(outputStream1);