* @throws IOException if the GIZA dictionary file cannot be read, or the vocabulary/probability output files cannot be written
*/
public static void createTTableFromGIZA(String filename, String srcVocabFile, String trgVocabFile, String probsFile, FileSystem fs) throws IOException{
  logger.setLevel(Level.INFO);
  TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
  VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();

  int cnt = 0;                                             // total dictionary lines read
  int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0; // for statistical purposes only

  // In GIZA output, dictionary entries are in random order (w.r.t. prob value), so we keep
  // a sorted set of the current source term's translations: at most NUM_TRANS entries, or
  // fewer once the cumulative probability mass exceeds PROB_THRESHOLD (early termination).
  //
  // NOTE(review): unlike the previous version, I/O errors now propagate (the method already
  // declares throws IOException) instead of being printed and swallowed, which used to
  // produce silently empty/partial output files. try-with-resources closes the reader on
  // every path, fixing a stream leak when an exception was thrown mid-parse.
  try (BufferedReader bis = new BufferedReader(
      new InputStreamReader(new FileInputStream(new File(filename)), "UTF-8"))) {
    String srcTerm = null, trgTerm = null, prev = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    boolean earlyTerminate = false, skipTerm = false;
    float sumOfProbs = 0.0f, prob;

    // Each line is "trgTerm srcTerm prob"; lines for the same source term are contiguous.
    String line;
    while ((line = bis.readLine()) != null) {
      String[] parts = line.split(" ");
      if (parts.length != 3) {
        throw new RuntimeException("Unknown format: " + line);
      }
      cnt++;
      trgTerm = parts[0];
      srcTerm = parts[1];
      prob = Float.parseFloat(parts[2]);

      if (prev == null || !srcTerm.equals(prev)) {
        // New source term: first flush the previous term's top translations to the ttable.
        if (topTrans.size() > 0) {
          int finalNumTrans = addToTable(curIndex, topTrans, table, trgVocab);
          if (finalNumTrans < NUM_TRANS) {
            cntShortTail++;
            sumShortTail += finalNumTrans;
          } else {
            cntLongTail++;
          }
        }
        logger.debug("Line:" + line);

        // Reset per-term state for the new source term.
        sumOfProbs = 0.0f;
        topTrans.clear();
        earlyTerminate = false;
        skipTerm = false;
        prev = srcTerm;
        int prevIndex = curIndex;
        curIndex = srcVocab.addOrGet(srcTerm);
        if (curIndex <= prevIndex) {
          // We've seen this foreign term before, probably due to a tokenization or sorting
          // error in the aligner. Revert curIndex and ignore all of this term's entries.
          curIndex = prevIndex;
          skipTerm = true;
          continue;
        }
        logger.debug("Processing: " + srcTerm + " with index: " + curIndex);
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        sumOfProbs += prob;
      } else if (!earlyTerminate && !skipTerm) {
        // Same source term: keep accumulating, retaining only the top NUM_TRANS entries.
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        if (topTrans.size() > NUM_TRANS) {
          // pollFirst() removes the lowest-probability entry; keep sumOfProbs in sync.
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;
      } else {
        logger.debug("Skipped");
      }

      if (sumOfProbs > PROB_THRESHOLD) {
        earlyTerminate = true;
        logger.debug("Sum of probs > " + PROB_THRESHOLD + ", early termination.");
      }
    }

    // Flush the final source term's top translations to the ttable.
    if (topTrans.size() > 0) {
      int finalNumTrans = addToTable(curIndex, topTrans, table, trgVocab);
      if (finalNumTrans < NUM_TRANS) {
        cntShortTail++;
        sumShortTail += finalNumTrans;
      } else {
        cntLongTail++;
      }
    }

    logger.info("Vocabulary Target: " + trgVocab.size() + " elements");
    logger.info("Vocabulary Source: " + srcVocab.size() + " elements");
    logger.info("# source terms with > " + PROB_THRESHOLD + " probability covered: " + cntShortTail + " and average translations per term: " + (sumShortTail / (cntShortTail + 0.0f)));
    logger.info("# source terms with <= " + PROB_THRESHOLD + " probability covered: " + cntLongTail + " (each has " + NUM_TRANS + " translations)");
  }

  // Persist the two vocabularies and the translation table to HDFS; try-with-resources
  // guarantees each stream is flushed and closed even if a later write fails.
  try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(trgVocabFile))))) {
    trgVocab.write(dos);
  }
  try (DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(srcVocabFile))))) {
    srcVocab.write(dos2);
  }
  try (DataOutputStream dos3 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(probsFile))))) {
    table.write(dos3);
  }
}