* @throws IOException if the GIZA alignment file cannot be read, or if writing the vocabulary/probability output files fails
*/
public static void createTTableFromGIZA(String filename, String srcVocabFile, String trgVocabFile, String probsFile, float probThreshold, int numTrans, FileSystem fs) throws IOException{
  logger.setLevel(Level.INFO);
  TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
  VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
  int cnt = 0; // number of GIZA lines processed
  // In GIZA output, dictionary entries are in random order (w.r.t. prob value), so we keep a
  // sorted set of the top numTrans (or fewer) entries per source term, without exceeding
  // <probThreshold> cumulative probability (enforced downstream in addToTable).
  HookaStats stats = new HookaStats(numTrans, probThreshold);
  // try-with-resources guarantees the reader is closed even when parsing throws;
  // any IOException now propagates to the caller instead of being swallowed, so we
  // never silently write out a partial/empty table as the old code did.
  try (BufferedReader bis = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), "UTF-8"))) {
    String srcTerm = null, trgTerm = null, prev = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    // earlyTerminate is currently never enabled (the sum-of-probs check below is disabled),
    // but is kept so the skip logic can be re-activated without restructuring the loop.
    boolean earlyTerminate = false, skipTerm = false;
    float sumOfProbs = 0.0f, prob;
    String line;
    while ((line = bis.readLine()) != null) {
      String[] parts = line.split(" ");
      if (parts.length != 3) {
        throw new RuntimeException("Unknown format: " + cnt + " = \n" + line);
      }
      cnt++;
      trgTerm = parts[0];
      srcTerm = parts[1];
      prob = Float.parseFloat(parts[2]);
      if (trgTerm.equals("NULL")) {
        continue; // skip alignments to imaginary NULL word
      }
      // new source term (ignore punctuation)
      if ((prev == null || !srcTerm.equals(prev)) && !delims.contains(srcTerm)) {
        if (topTrans.size() > 0) {
          // store previous term's top translations to ttable
          addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
        }
        logger.debug("Line:" + line);
        // initialize the translation distribution of the new source term
        sumOfProbs = 0.0f;
        topTrans.clear();
        earlyTerminate = false; // reset status
        skipTerm = false;
        prev = srcTerm;
        int prevIndex = curIndex;
        curIndex = srcVocab.addOrGet(srcTerm);
        if (curIndex <= prevIndex) {
          // we've seen this foreign term before — probably a tokenization or sorting
          // error in the aligner. Ignore all lines for this term.
          logger.debug("FLAG: " + line);
          curIndex = prevIndex; // revert curIndex value since we're skipping this one
          skipTerm = true;
          continue;
        }
        logger.debug("Processing: " + srcTerm + " with index: " + curIndex);
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        sumOfProbs += prob;
        logger.debug("Added to queue: " + trgTerm + " with prob: " + prob + " (sum: " + sumOfProbs + ")");
      } else if (!earlyTerminate && !skipTerm && !delims.contains(srcTerm)) {
        // continue adding (term, prob) pairs for the current source term
        topTrans.add(new PairOfFloatString(prob, trgTerm));
        sumOfProbs += prob;
        logger.debug("Added to queue: " + trgTerm + " with prob: " + prob + " (sum: " + sumOfProbs + ")");
        // keep only the top numTrans translations: evict the lowest-probability entry
        if (topTrans.size() > numTrans) {
          PairOfFloatString pair = topTrans.pollFirst();
          sumOfProbs -= pair.getLeftElement();
          logger.debug("Removed from queue: " + pair.getRightElement() + " (sum: " + sumOfProbs + ")");
        }
      } else {
        logger.debug("Skipped line: " + line);
      }
      // // line processed: check if early terminate (disabled; see earlyTerminate note above)
      // if (sumOfProbs > probThreshold) {
      //   earlyTerminate = true;
      //   logger.debug("Sum of probs > " + probThreshold + ", early termination.");
      // }
    }
    // flush the last source term's accumulated translations
    if (topTrans.size() > 0) {
      addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
    }
    logger.info("File " + filename + ": read " + cnt + " lines");
    logger.info("Vocabulary Target: " + trgVocab.size() + " elements");
    logger.info("Vocabulary Source: " + srcVocab.size() + " elements");
    logger.info(stats);
  }
  // Persist vocabularies and the translation table; try-with-resources closes each
  // stream even if a write fails (the old code leaked streams on error).
  try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(trgVocabFile))))) {
    trgVocab.write(dos);
  }
  try (DataOutputStream dos2 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(srcVocabFile))))) {
    srcVocab.write(dos2);
  }
  try (DataOutputStream dos3 = new DataOutputStream(new BufferedOutputStream(fs.create(new Path(probsFile))))) {
    table.write(dos3);
  }
}