// Builds a term-frequency vector for this key from the first tuple in
// `values` and emits it; only that first tuple is consumed here.
if (!values.hasNext()) {
  return;
}
StringTuple value = values.next();
// value.length() is only a guess at the initial size of the sparse vector.
Vector vector = new RandomAccessSparseVector(key.toString(), dimension, value.length());

if (maxNGramSize >= 2) {
  // N-gram mode: run the tuple's entries through a shingle filter and count
  // every produced term that is present in the dictionary.
  ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
      maxNGramSize);
  try {
    // The term attribute is read once before the first incrementToken() call;
    // the length check below skips an empty term on that pass.
    // NOTE(review): presumably the attribute is empty before the first
    // incrementToken() -- confirm against the Lucene version in use.
    do {
      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
      if (term.length() > 0 && dictionary.containsKey(term)) { // ngram
        int termId = dictionary.get(term);
        vector.setQuick(termId, vector.getQuick(termId) + 1);
      }
    } while (sf.incrementToken());
    sf.end();
  } finally {
    // Always release the filter, even when tokenization or a vector update
    // throws; previously close() was skipped on any exception.
    sf.close();
  }
} else {
  // Unigram mode: count each non-empty entry that is present in the dictionary.
  for (String term : value.getEntries()) {
    if (term.length() > 0 && dictionary.containsKey(term)) { // unigram
      int termId = dictionary.get(term);
      vector.setQuick(termId, vector.getQuick(termId) + 1);
    }
  }
}

if (sequentialAccess) {
  // Re-wrap the counts in a sequential-access sparse vector when requested.
  vector = new SequentialAccessSparseVector(vector);
}

// If the vector has no non-default entries (nothing matched the dictionary),
// don't waste space sending it to disk; just count the occurrence instead.
if (vector.getNumNondefaultElements() > 0) {
  vectorWritable.set(vector);
  output.collect(key, vectorWritable);
} else {
  reporter.incrCounter("TFParticalVectorReducer", "emptyVectorCount", 1);
}