    }
    return token;
  }
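
  /**
   * Computes a probability distribution over target-language translations of a single
   * source-language query token, using the f-to-e translation table.
   *
   * @param query the full query string (not referenced in this method body)
   * @param token the source-language token to translate
   * @param pairsInSCFG if non-null, only (source, target) pairs contained in this set
   *        (i.e., licensed by the SCFG) are kept
   * @param stemmed2Stemmed optional mapping between stemmed forms under the two
   *        tokenizers; used by the OOV back-off below
   * @return translation weights, normalized by the cumulative probability of all
   *         candidates considered (so weights may sum to less than 1 after filtering)
   */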
  protected HMapSFW getTranslations(String query, String token, Set<PairOfStrings> pairsInSCFG, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = new HMapSFW();
    int f = fVocab_f2e.get(token);
    if (f <= 0) {
      // heuristic: the token is OOV w.r.t. the translation table, so back off to the
      // token itself (mapped through the stemming table, if one was provided) as its
      // only translation
      String target = (stemmed2Stemmed == null || stemmed2Stemmed.get(token) == null) ? token : stemmed2Stemmed.get(token);
      probDist.put(target, 1.0f);
      return probDist;
    }
    PriorityQueue<PairOfFloatInt> eS = f2eProbs.get(f).getTranslationsWithProbs(lexProbThreshold);
    // LOG.info("Adding " + eS.size() + " translations for " + token + "," + f);
    float sumProbEF = 0;
    int numTrans = 0;
    // tf(e) = sum_f { tf(f) * Pr(e|f) }
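    // Illustrative sketch (hypothetical numbers): if the token has candidate
    // translations e1 with Pr(e1|f) = 0.5 and e2 with Pr(e2|f) = 0.3, the loop below
    // polls them in decreasing order of probability, accumulates sumProbEF = 0.8, and
    // the normalization at the end assigns weights 0.5 / 0.8 and 0.3 / 0.8.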
    while (numTrans < numTransPerToken && !eS.isEmpty()) {
      PairOfFloatInt entry = eS.poll();
      float probEF = entry.getLeftElement();
      int e = entry.getRightElement();
      String eTerm = eVocab_f2e.get(e);
      // LOG.info("Pr(" + eTerm + "|" + token + ")=" + probEF);
      // keep a candidate only if (1) it has positive probability and a valid vocab id,
      // (2) it is not a stopword in the document language, (3) in --translate_only=indri
      // mode it passes the Indri punctuation pattern, and (4) when an SCFG is provided,
      // the (source, target) pair is licensed by it
      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm)
          && (translateOnly == null || !translateOnly.equals("indri") || indriPuncPattern.matcher(eTerm).matches())
          && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token, eTerm)))) {
        // Our bilingual dictionary is assumed to be learned from text with standard
        // segmentation, but for CLIR purposes we may tokenize documents into bigrams.
        // In that case, convert each translation into its sequence of bigrams and
        // distribute its translation probability equally among them.
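        // Illustrative sketch (hypothetical tokens): if eTerm "abc" has probEF = 0.4
        // and the bigram tokenizer splits it into {"ab", "bc"}, each bigram receives
        // 0.4 / 2 = 0.2, while the full 0.4 still counts toward sumProbEF below.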
        if (bigramSegment) {
          String[] eTokens = docLangTokenizer.processContent(eTerm);
          float splitProb = probEF / eTokens.length;
          for (String eToken : eTokens) {
            // heuristic: only keep translations that occur in our collection
            // exception: no index is specified in --translate_only mode, so we cannot
            // apply this heuristic there
            if (env == null || env.getPostingsList(eToken) != null) {
              // increment (rather than put) so that repeated bigrams accumulate their
              // equal shares instead of overwriting one another
              probDist.increment(eToken, splitProb);
            }
          }
          // add the full probability mass to the normalizer, even for bigrams dropped
          // by the index filter above: empirically this works better, and it yields the
          // same weights we would get without the index filtering, only faster
          sumProbEF += probEF;
        } else {
          // heuristic: only keep translations that occur in our collection
          // exception: no index is specified in --translate_only mode, so we cannot
          // apply this heuristic there
          if (env == null || env.getPostingsList(eTerm) != null) {
            probDist.increment(eTerm, probEF);
            sumProbEF += probEF;
          }
        }
        numTrans++;
      } else {
        LOG.info("Skipped target stopword/OOV " + eTerm);
      }
      // terminate early once the cumulative probability mass reaches the threshold
      if (sumProbEF > cumProbThreshold || numTrans >= numTransPerToken) {
        break;
      }
    }
    // normalize weights by the cumulative mass of all candidates considered; since
    // bigrams dropped by the index filter still contribute to sumProbEF, the final
    // weights may sum to less than 1
    for (String e : probDist.keySet()) {
      probDist.put(e, probDist.get(e) / sumProbEF);
    }
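    // Illustrative sketch (hypothetical numbers): if candidates with probEF 0.5 and
    // 0.3 were polled (sumProbEF = 0.8) but the 0.3 candidate's bigrams were all
    // dropped by the index filter, the surviving candidate gets weight
    // 0.5 / 0.8 = 0.625 rather than 1.0, exactly as it would without the filtering.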
// LOG.info("Translations of "+token+"="+probDist);
return probDist;