if (logger.isLoggable(Level.FINE)) {
logger.fine("Counting word co-occurrence from parallel corpus. Using floor probability " + floorProbability);
}
Alignments alignments = parallelCorpus.getAlignments();
Corpus sourceCorpus = parallelCorpus.getSourceCorpus();
Corpus targetCorpus = parallelCorpus.getTargetCorpus();
int numSentences = parallelCorpus.getNumSentences();
Counts<Integer,Integer> counts = new Counts<Integer,Integer>(floorProbability);
// Iterate over each sentence
for (int sentenceID=0; sentenceID<numSentences; sentenceID++) {
int sourceStart = sourceCorpus.getSentencePosition(sentenceID);
int sourceEnd = sourceCorpus.getSentenceEndPosition(sentenceID);
int targetStart = targetCorpus.getSentencePosition(sentenceID);
int targetEnd = targetCorpus.getSentenceEndPosition(sentenceID);
// Iterate over each word in the source sentence
for (int sourceIndex=sourceStart; sourceIndex<sourceEnd; sourceIndex++) {
// Get the token for the current source word
int sourceWord = sourceCorpus.getWordID(sourceIndex);
// Get the target indices aligned to this source word
int[] targetPoints = alignments.getAlignedTargetIndices(sourceIndex);
// If the source word is unaligned,
// then we treat it as being aligned to a special NULL token;
// we use Java's null to represent the NULL token
if (targetPoints==null) {
counts.incrementCount(sourceWord, null);
} else {
// If the source word is aligned,
// then we must iterate over each aligned target point
for (int targetPoint : targetPoints) {
int targetWord = targetCorpus.getWordID(targetPoint);
counts.incrementCount(sourceWord, targetWord);
}
}
}
// Iterate over each word in the target sentence
for (int targetIndex=targetStart; targetIndex<targetEnd; targetIndex++) {
// Get the token for the current target word
int targetWord = targetCorpus.getWordID(targetIndex);
// Get the source indices aligned to this target word
int[] sourcePoints = alignments.getAlignedSourceIndices(targetIndex);
// If the target word is unaligned,
// then we treat it as being aligned to a special NULL token;
// we use Java's null to represent the NULL token
if (sourcePoints==null) {