}
// In an SCFG rule such as "a b X1 X2 c --> X1 d e X2 f", we want to find the src/trg tokens that are aligned to some trg/src token, ignoring the X variables.
// We can then decide whether to include them as a multi-token phrase in our query representation, based on various heuristics (e.g., only include the phrase if no X occurs between its tokens).
String fPhrase = "";
ArrayListOfInts sourceTokenIds = new ArrayListOfInts();
ArrayListOfInts targetTokenIds = new ArrayListOfInts();
int f=0;
for (; f < lhs.length; f++) {
String fTerm = lhs[f];
if (queryLangTokenizer.isStopWord(fTerm) || fTerm.matches("\\[X,\\d+\\]") || fTerm.matches("<s>") || fTerm.matches("</s>")) {
continue;
}
srcTokenCnt.increment(fTerm);
sourceTokenIds.add(f);
ArrayListOfInts ids;
if (isPassThrough){
ids = new ArrayListOfInts();
ids.add(0);
}else {
ids = one2manyAlign.get(f);
}
if (ids == null || (isOne2Many == 0 && ids.size() > 1)) {
continue;
}
// find phrase in LHS and match to phrase in RHS
if (isMany2Many) {
fPhrase += fTerm + " ";
targetTokenIds = targetTokenIds.mergeNoDuplicates(ids);
}
String eTerm = null;
for (int e : ids) {
eTerm = rhs[e];
// assumption: if this is pass-through rule, re-stem token in doc-language
if (isPassThrough || (unknownWords != null && unknownWords.contains(fTerm))) {
eTerm = stemmed2Stemmed.get(eTerm);
}
if (eTerm == null || docLangTokenizer.isStopWord(eTerm)) {
// LOG.info("Skipped trg token " + eTerm);
eTerm = null;
continue;
}
bagOfTargetTokens.add(eTerm);
if (isOne2Many <= 1) {
if (probDist.containsKey(fTerm)) {
HMapSFW eToken2Prob = probDist.get(fTerm);
eToken2Prob.increment(eTerm, weight);
}else {
HMapSFW eToken2Prob = new HMapSFW();
eToken2Prob.put(eTerm, weight);
probDist.put(fTerm, eToken2Prob);
}
}
}
if (isOne2Many == 2) {
// If ids.size() > 1, eTerm is a multi-token expression.
// Even though eTerm is overwritten here, the loop above is still needed to update bagOfTargetTokens.
if (ids.size() > 1) {
eTerm = isConsecutiveWithStopwords(ids, rhs, docLangTokenizer); // <---- heuristic
}
// no proper translation on target-side (e.g., stopword OR non-consecutive multi-word translation), let's skip
if (eTerm == null) {