Reporter reporter) throws IOException {
//key: one parallel sentence (the same sentence in both languages) together with its alignment, in XML format
//value: ignored
ParallelChunk c = pcr.parseString(key.toString());
ok.set(c.idString());
//Chunk is an array of tokens in the sentence, without any special tokenization (just separated by spaces)
Chunk fc = c.getChunk(src);
Chunk ec = c.getChunk(tgt);
if (fc == null || ec == null) {
reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
return;
}
if (fc.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
return;
}
if (ec.getLength() > 200) {
reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
return;
}
//ec,fc: English/French sentence represented as sequence of words
//vocE,vocF: vocabularies for english and french, of type VocabularyWritable
//ee,fe: integer representation of words in sentences ec and fc
sLogger.debug("Target sentence:");
int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
sLogger.debug("Source sentence:");
int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);
//e,f: phrase from whole sentence
Phrase e = new Phrase(ee, 0);
Phrase f = new Phrase(fe, 1);
edu.umd.hooka.PhrasePair b = new PhrasePair(f,e);
ReferenceAlignment ra = c.getReferenceAlignment(lp);
if (ra != null) {
b.setAlignment(ra);
}
reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);