// Nothing to emit when no values are available for this key.
if (!it.hasNext()) {
  return;
}
// Only one value is pulled from the iterator in this section; the term counts below are
// built from that single tuple.
StringTuple value = it.next();
Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size
if (maxNGramSize >= 2) {
  // n-gram mode: run the tuple's entries through a ShingleFilter and count every produced
  // term that is present in the dictionary.
  ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
  try {
    // NOTE(review): the first pass reads the term attribute before incrementToken() has been
    // called; the length() > 0 guard is what skips it if the attribute is still empty at that
    // point (presumably it is -- confirm against the Lucene version in use).
    do {
      String term = (sf.getAttribute(TermAttribute.class)).term();
      if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
        int termId = dictionary.get(term);
        vector.setQuick(termId, vector.getQuick(termId) + 1);
      }
    } while (sf.incrementToken());
    sf.end();
  } finally {
    // Close the filter even when tokenization or counting throws, so the stream is not leaked.
    sf.close();
  }
} else {
  // unigram mode: count each entry of the tuple that is present in the dictionary.
  for (String term : value.getEntries()) {
    if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
      int termId = dictionary.get(term);
      vector.setQuick(termId, vector.getQuick(termId) + 1);
    }
  }
}
// Optionally re-wrap the counted vector according to the configured flags.
if (sequentialAccess) {
  vector = new SequentialAccessSparseVector(vector);
}
if (namedVector) {
  // The vector is named after the string form of the key.
  vector = new NamedVector(vector, key.toString());
}
// if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk.
if (vector.getNumNondefaultElements() > 0) {
  VectorWritable vectorWritable = new VectorWritable(vector);
  context.write(key, vectorWritable);
} else {
  // Counter group/name strings are runtime identifiers and are kept exactly as they were.
  context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1);
}