* @throws IOException
*/
public static void createTTableFromBerkeleyAligner(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile, float probThreshold, int numTrans, FileSystem fs) throws IOException{
  /*
   * Reads a BerkeleyAligner lexical dictionary from the local file inputFile (decoded as UTF-8),
   * builds a source vocabulary, a target vocabulary and a translation table from it, and writes
   * the three structures to trgVocabFile, srcVocabFile and probsFile through fs.
   *
   * Input layout, as recognised by the two patterns below:
   *   - a header line   "<source term>\tentropy ... nTrans ..."   starts a new source term;
   *   - entry lines     "<target term>: <probability>"            follow the header.
   * For every source term at most numTrans lines after the header are examined, and reading of
   * entries stops as soon as the accumulated probability exceeds probThreshold. Entries whose
   * target term is the literal "NULL" are ignored. The kept probabilities are renormalised so
   * that they sum to one.
   *
   * Any I/O failure (missing input file, read error, write error) is propagated to the caller
   * as an IOException; no output file is written when the input cannot be read completely.
   * All streams are closed on both the normal and the exceptional path.
   */
  logger.setLevel(Level.INFO);
  TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
  VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
  int cnt = 0;    // number of input lines read; for statistical purposes only
  HookaStats stats = new HookaStats(numTrans, probThreshold);

  // Compiled once instead of once per input line.
  final Pattern headerPattern = Pattern.compile("(.+)\\tentropy .+nTrans");
  final Pattern entryPattern = Pattern.compile("\\s*(\\S+): (.+)");

  // In BerkeleyAligner output, dictionary entries of each source term are already sorted by prob. value.
  FileInputStream fis = new FileInputStream(new File(inputFile));
  try {
    BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
    String line = null;
    // True when the entry loop has already consumed the header line of the next source term,
    // so the outer loop must process the current line instead of reading a new one.
    boolean headerPending = false;
    while (true) {
      if (!headerPending) {
        line = reader.readLine();
        if (line == null) {
          break;
        }
        cnt++;
      }
      headerPending = false;
      logger.debug("Line:"+line);

      Matcher header = headerPattern.matcher(line);
      if (!header.find()) {
        // Not a header: either a leftover entry beyond the cut-off or an unrelated line.
        continue;
      }
      String srcTerm = header.group(1);
      int srcIndex = srcVocab.addOrGet(srcTerm);
      logger.debug("Found: "+srcTerm+" with index: "+srcIndex);

      List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
      float sumOfProbs = 0.0f;
      for (int i = 0; i < numTrans; i++) {
        line = reader.readLine();
        if (line == null) {
          // End of file: nothing more to read for this term; the outer loop will terminate.
          break;
        }
        cnt++;
        Matcher entry = entryPattern.matcher(line);
        if (entry.find()) {
          String term = entry.group(1);
          if (!term.equals("NULL")) {
            float prob = Float.parseFloat(entry.group(2));
            int trgIndex = trgVocab.addOrGet(term);
            logger.debug("Added: "+term+" with index: "+trgIndex+" and prob:"+prob);
            indexProbPairs.add(new PairOfIntFloat(trgIndex, prob));
            sumOfProbs += prob;
          }
        } else if (headerPattern.matcher(line).find()) {
          // This term has fewer than numTrans entries; the line just read starts the next term.
          logger.debug("Early terminate");
          headerPending = true;
          break;
        }
        if (sumOfProbs > probThreshold) {
          // Enough probability mass collected: the remaining entries of this term are dropped.
          stats.incCntShortTail(1);
          stats.incSumShortTail(i+1);
          break;
        }
      }
      if (sumOfProbs <= probThreshold) {
        // The threshold was never reached within the examined entries.
        stats.incCntLongTail(1);
        stats.incSumCumProbs(sumOfProbs);
      }

      // to enable faster access with binary search, we sort entries by vocabulary index.
      Collections.sort(indexProbPairs);
      int numEntries = indexProbPairs.size();
      int[] indices = new int[numEntries];
      float[] probs = new float[numEntries];
      int k = 0;
      for (PairOfIntFloat pair : indexProbPairs) {
        indices[k] = pair.getLeftElement();
        // Renormalise over the kept entries. A zero sum (all listed probabilities are 0) would
        // yield NaN, so in that case the raw values are stored unchanged.
        probs[k] = sumOfProbs > 0.0f ? pair.getRightElement()/sumOfProbs : pair.getRightElement();
        k++;
      }
      table.set(srcIndex, new IndexedFloatArray(indices, probs, true));
    }
  } finally {
    // Closing the underlying stream releases the file handle; the readers hold no other resource.
    fis.close();
  }

  logger.info("File "+inputFile+": read "+cnt+" lines");
  logger.info("Vocabulary Target: "+trgVocab.size()+" elements");
  logger.info("Vocabulary Source: "+srcVocab.size()+" elements");
  logger.info(stats);

  DataOutputStream trgVocabOut = new DataOutputStream(new BufferedOutputStream
      (fs.create(new Path(trgVocabFile))));
  try {
    trgVocab.write(trgVocabOut);
  } finally {
    trgVocabOut.close();
  }

  DataOutputStream srcVocabOut = new DataOutputStream(new BufferedOutputStream
      (fs.create(new Path(srcVocabFile))));
  try {
    srcVocab.write(srcVocabOut);
  } finally {
    srcVocabOut.close();
  }

  DataOutputStream probsOut = new DataOutputStream(new BufferedOutputStream
      (fs.create(new Path(probsFile))));
  try {
    table.write(probsOut);
  } finally {
    probsOut.close();
  }
}