* @throws IOException if reading the input files or creating the output files on the given file system fails
*/
public static void createTTableFromHooka(String srcVocabFile, String trgVocabFile, String tableFile, String finalSrcVocabFile, String finalTrgVocabFile, String finalTableFile, FileSystem fs) throws IOException{
logger.setLevel(Level.INFO);
Vocab srcVocab = HadoopAlign.loadVocab(new Path(srcVocabFile), fs);
Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);
Vocab finalSrcVocab = new VocabularyWritable();
Vocab finalTrgVocab = new VocabularyWritable();
TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();
String srcTerm = null, trgTerm = null;
int curIndex = -1;
TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
float sumOfProbs = 0.0f, prob;
int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0; // for statistical purposes only
// Build the final ttable from the current one using the following criteria: keep the top NUM_TRANS translations per source term, unless the cumulative probability exceeds PROB_THRESHOLD before that.
for(int srcIndex=1; srcIndex<srcVocab.size(); srcIndex++){
int[] translations;
try {
translations = ttable.get(srcIndex).getTranslations(0.0f);
} catch (Exception e) {
logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
continue;
}
srcTerm = srcVocab.get(srcIndex);
curIndex = finalSrcVocab.addOrGet(srcTerm);
//initialize this term
topTrans.clear();
sumOfProbs = 0.0f;
logger.debug("Processing: "+srcTerm+" with index: "+curIndex+" ("+srcIndex+")");
for(int trgIndex : translations){
trgTerm = trgVocab.get(trgIndex);
prob = ttable.get(srcIndex, trgIndex);
topTrans.add(new PairOfFloatString(prob, trgTerm));
// keep top NUM_TRANS translations
if(topTrans.size() > NUM_TRANS){
float removedProb = topTrans.pollFirst().getLeftElement();
sumOfProbs -= removedProb;
}
sumOfProbs += prob;
if(sumOfProbs > PROB_THRESHOLD){
logger.debug("Sum of probs > "+PROB_THRESHOLD+", early termination.");
break;
}
}
// store the current term's top translations in the final ttable
if(topTrans.size() > 0){
int finalNumTrans = addToTable(curIndex, topTrans, finalTTable, finalTrgVocab);
if(finalNumTrans < NUM_TRANS){
cntShortTail++;
sumShortTail += finalNumTrans;
}else{
cntLongTail++;
}
}
}
logger.info("Vocabulary Target: "+finalTrgVocab.size()+" elements");
logger.info("Vocabulary Source: "+finalSrcVocab.size()+" elements");
logger.info("# source terms with > "+PROB_THRESHOLD+" probability covered: "+cntShortTail+" and average translations per term: "+(sumShortTail/(cntShortTail+0.0f)));
logger.info("# source terms with <= "+PROB_THRESHOLD+" probability covered: "+cntLongTail+" (each has "+ NUM_TRANS +" translations)");
DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(finalTrgVocabFile))));