* @throws IOException
*/
public static void createTTableFromGIZA(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile,
float probThreshold, int numTrans, FileSystem fs) throws IOException{
TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
int cnt = 0;
// In GIZA output, the dictionary entries of a source term appear in arbitrary order with respect to probability, so we keep a sorted set of at most numTrans top entries whose total probability does not exceed probThreshold.
try {
DataInputStream d = new DataInputStream(fs.open(new Path(inputFile)));
BufferedReader inputReader = new BufferedReader(new InputStreamReader(d));
String srcTerm = null, trgTerm = null, prev = null;
int curIndex = -1;
TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
String line = "";
boolean earlyTerminate = false, skipTerm = false;
float sumOfProbs = 0.0f, prob;
HookaStats stats = new HookaStats(numTrans, probThreshold);
while (true) {
// line = bis.readLine();
line = inputReader.readLine();
if(line == null) break;
String[] parts = line.split(" ");
if(parts.length != 3){
throw new RuntimeException("Unknown format: "+cnt+" = \n"+line);
}
cnt++;
trgTerm = parts[0];
srcTerm = parts[1];
prob = Float.parseFloat(parts[2]);
if (trgTerm.equals("NULL")) {
continue; // skip alignments to imaginary NULL word
}
// new source term (ignore punctuation)
if ((prev==null || !srcTerm.equals(prev)) && !delims.contains(srcTerm)){
if(topTrans.size() > 0){
// store previous term's top translations to ttable
addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
}
logger.debug("Line:"+line);
// initialize the translation distribution of the source term
sumOfProbs = 0.0f;
topTrans.clear();
earlyTerminate = false; // reset status
skipTerm = false;
prev = srcTerm;
int prevIndex = curIndex;
curIndex = srcVocab.addOrGet(srcTerm);
if(curIndex <= prevIndex){
// We have seen this foreign term before, probably due to a tokenization or sorting error in the aligner, so just ignore it.
logger.debug("FLAG: "+line);
curIndex = prevIndex; // revert curIndex value since we're skipping this one
skipTerm = true;
continue;
}
logger.debug("Processing: "+srcTerm+" with index: "+curIndex);
topTrans.add(new PairOfFloatString(prob, trgTerm));
sumOfProbs += prob;
logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");
}else if(!earlyTerminate && !skipTerm && !delims.contains(srcTerm)){ //continue adding translation term,prob pairs (except if early termination is ON)
topTrans.add(new PairOfFloatString(prob, trgTerm));
sumOfProbs += prob;
logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");
// keep top numTrans translations
if(topTrans.size() > numTrans){
PairOfFloatString pair = topTrans.pollFirst();
float removedProb = pair.getLeftElement();
sumOfProbs -= removedProb;
logger.debug("Removed from queue: "+pair.getRightElement()+" (sum: "+sumOfProbs+")");
}
}else{
logger.debug("Skipped line: "+line);
}
}
//last one
if(topTrans.size()>0){
//store previous term's top translations to ttable
addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
}
// dispose all the resources after using them.
inputReader.close();
System.err.println("File " + inputFile + ": read " + cnt + " lines");
System.err.println("Vocabulary Target: " + trgVocab.size() + " elements");
System.err.println("Vocabulary Source: " + srcVocab.size() + " elements");
System.err.println(stats);
}catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();