    }
  }
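
  // For each Wikipedia page, look up the docnos of documents previously found similar to it
  // (the pwsim mapping), and convert the page content into per-sentence tf-idf vectors.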
  public void map(Writable docnoKey, Indexable page, OutputCollector<PairOfInts, WikiDocInfo> output, Reporter reporter) throws IOException {
    int docno = ((IntWritable) docnoKey).get();
    WikipediaPage p = (WikipediaPage) page;
    String lang = p.getLanguage();
    ArrayListOfIntsWritable similarDocnos;

    // We only load the mapping once, during the first map() call of a mapper.
    // This works because all input key-value pairs of a given mapper have the same language id (reason explained above).
    if (pwsimMapping.isEmpty()) {
      loadPairs(pwsimMapping, lang, mJob, reporter);
      sLogger.debug("Loaded " + pwsimMapping.size() + " pwsim pairs");
    }

    // If there are no similar documents for this docno, there is nothing to do for this page.
    if (pwsimMapping.containsKey(docno)) {
      similarDocnos = pwsimMapping.get(docno);
    } else {
      return;
    }
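
    // The document's sentences, their tf-idf vectors, and their lengths, filled in parallel by the helper below.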
    ArrayListWritable<Text> sentences;
    ArrayListWritable<HMapSFW> vectors = new ArrayListWritable<HMapSFW>();
    ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
    try {
      if (lang.equals("en")) {
        // Identify sentences in the document and filter out those below the MinSentLength threshold.
        // Convert each sentence into a tf-idf vector, using the general DF map for the collection and a heuristic for average document length.
        // Filter out sentences whose vectors have fewer than MinVectorTerms terms.
        sentences = helper.getESentences(p.getContent(), vectors, sentLengths);
      } else {
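        // Same sentence extraction and vector conversion for the non-English (foreign-language) document.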
        sentences = helper.getFSentences(p.getContent(), vectors, sentLengths);
      }
      if (sentences.size() != vectors.size()) {
        throw new RuntimeException("Sentences.size != Vectors.size");
      }
    } catch (Exception e) {