new BufferedInputStream(new FileInputStream(compressedDocuments)));
int documents = documentCounter.get();
// Use the number of times the term occurred in the corpus to determine
// how many rows (contexts) the matrix needs.
SparseMatrix contextsForCurTerm = new YaleSparseMatrix(
termCounts.get(termIndex).get(), termToIndex.size());
int contextsSeen = 0;
for (int d = 0; d < documents; ++d) {
final int docId = d;
int tokensInDoc = corpusReader.readInt();
int unfilteredTokens = corpusReader.readInt();
// Read in the document
int[] doc = new int[tokensInDoc];
for (int i = 0; i < tokensInDoc; ++i)
doc[i] = corpusReader.readInt();
int contextsInDoc =
processIntDocument(termIndex, doc, contextsForCurTerm,
contextsSeen, termFeatures);
contextsSeen += contextsInDoc;
}
corpusReader.close();
// If the term is to be processed using fewer than all of its contexts,
// then randomly select the maximum allowable contexts from the matrix
if (maxContextsPerWord < Integer.MAX_VALUE &&
contextsForCurTerm.rows() > maxContextsPerWord) {
BitSet randomContexts = Statistics.randomDistribution(
maxContextsPerWord, contextsForCurTerm.rows());
contextsForCurTerm =
new SparseRowMaskedMatrix(contextsForCurTerm, randomContexts);
}
return contextsForCurTerm;