String lang = p.getLanguage();
int langID;
ArrayListWritable<Text> sentences;
ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
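// note: sentences, vectors, and sentLengths are parallel lists, filled in lockstep by the helper calls below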
// identify sentences in the document, filtering out ones below the MinSentLength threshold
// convert each sentence into a tf-idf vector, using the collection-wide DF map and a heuristic for avg. doc length
// filter out sentences whose vector has fewer than MinVectorTerms terms
try {
  String article = p.getContent();
  if (lang.equals(eLang)) {
    sentences = helper.getESentences(article, vectors, sentLengths);
    langID = CLIRUtils.E;
  } else if (lang.equals(fLang)) {
    // Turkish Wiki articles' XML does not encode paragraph breaks;
    // insert a space after each period so sentence boundaries can be detected
    if (lang.equals("tr")) {
      article = article.replaceAll("\\.", ". ");
    }
    sentences = helper.getFSentences(article, vectors, sentLengths);
    langID = CLIRUtils.F;
  } else {
    throw new RuntimeException("Unknown language: " + lang);
  }
} catch (Exception e) {
  e.printStackTrace();
  throw new RuntimeException("Error in sentence detection for language: " + lang
      + ", helper: " + helper + ", article title: " + p.getTitle());
}
// documents with no sentences (after we filter out some by length)
if (sentences.size() == 0) {
  if (langID == CLIRUtils.E) {
    reporter.incrCounter(Docs.EEmpty, 1);
  } else {
    reporter.incrCounter(Docs.FEmpty, 1);
  }
} else {
  if (langID == CLIRUtils.E) {
    reporter.incrCounter(Docs.E, 1);
  } else {
    reporter.incrCounter(Docs.F, 1);
  }
}
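// emit one record per surviving sentence, skipping sentences whose OOV rate exceeds maxOOVRate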
for (int i = 0; i < sentences.size(); i++) {
  if (langID == CLIRUtils.E) {
    if (helper.getEOOVRate(sentences.get(i).toString()) > maxOOVRate) {
      reporter.incrCounter(Sentences.OOV, 1);
      continue;
    }
    reporter.incrCounter(Sentences.ELength, sentLengths.get(i));
    reporter.incrCounter(Sentences.E, 1);
  } else {
    if (helper.getFOOVRate(sentences.get(i).toString()) > maxOOVRate) {
      reporter.incrCounter(Sentences.OOV, 1);
      continue;
    }
    reporter.incrCounter(Sentences.FLength, sentLengths.get(i));
    reporter.incrCounter(Sentences.F, 1);
  }
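  // key: (docno, language id); value: (language id, sentence text, tf-idf vector)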
  keyOut.set(docno, langID);
  valOut.set(langID, sentences.get(i), vectors.get(i));
  output.collect(keyOut, valOut);