// Per-stem field-indices values; each one is handed to getWeightBoost together
// with the title field index.
final byte [] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

// Nothing to index: publish an empty matrix and an empty stem-to-row mapping.
if (documentCount == 0)
{
    vsmContext.termDocumentMatrix = new DenseDoubleMatrix2D(0, 0);
    vsmContext.stemToRowIndex = new IntIntOpenHashMap();
    return;
}

// Determine the index of the title field. The index stays at -1 when no field
// is named Document.TITLE.
// NOTE(review): -1 is still passed to getWeightBoost below -- confirm that
// helper treats a missing title field as "no boost".
int titleFieldIndex = -1;
final String [] fieldsName = preprocessingContext.allFields.name;
for (int i = 0; i < fieldsName.length; i++)
{
    if (Document.TITLE.equals(fieldsName[i]))
    {
        titleFieldIndex = i;
        break;
    }
}

// Determine the stems we, ideally, should include in the matrix
int [] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

// Compute one global weight per candidate stem (term weight from the stem's
// total tf and document frequency, multiplied by the weight boost) and order
// the candidates by descending weight, so that the highest-weighted stems are
// the ones that make it into the matrix when its size is capped.
final double [] stemsWeight = new double [stemsToInclude.length];
for (int i = 0; i < stemsToInclude.length; i++)
{
    final int stemIndex = stemsToInclude[i];
    // stemsTfByDocument[stemIndex] is read below as (document index, tf) pairs,
    // hence half of its length is the stem's document frequency.
    stemsWeight[i] = termWeighting.calculateTermWeight(stemsTf[stemIndex],
        stemsTfByDocument[stemIndex].length / 2, documentCount)
        * getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
}
final int [] stemWeightOrder = IndirectSort.mergesort(0, stemsWeight.length,
    new IndirectComparator.DescendingDoubleComparator(stemsWeight));

// Calculate the number of terms we can include to fulfill the max matrix size
final int maxRows = maximumMatrixSize / documentCount;
final DoubleMatrix2D tdMatrix = new DenseDoubleMatrix2D(Math.min(maxRows,
    stemsToInclude.length), documentCount);

// Fill the matrix row by row, in descending weight order, and record which row
// each stem ended up in. stemWeightOrder holds one entry per candidate stem, so
// tdMatrix.rows() == min(maxRows, stemsToInclude.length) is a safe bound for it.
final IntIntOpenHashMap stemToRowIndex = new IntIntOpenHashMap();
for (int row = 0; row < tdMatrix.rows(); row++)
{
    final int stemIndex = stemsToInclude[stemWeightOrder[row]];
    final int [] tfByDocument = stemsTfByDocument[stemIndex];
    final int df = tfByDocument.length / 2;

    // The boost depends only on the stem, not on the document, so it is
    // computed once per row rather than once per matrix cell.
    final double weightBoost = getWeightBoost(titleFieldIndex,
        stemsFieldIndices[stemIndex]);

    for (int j = 0; j < df; j++)
    {
        // Even slots hold the document (column) index, odd slots the stem's
        // term frequency in that document.
        final double weight = termWeighting.calculateTermWeight(
            tfByDocument[j * 2 + 1], df, documentCount) * weightBoost;
        tdMatrix.set(row, tfByDocument[j * 2], weight);
    }

    stemToRowIndex.put(stemIndex, row);
}

// Store the results
vsmContext.termDocumentMatrix = tdMatrix;
vsmContext.stemToRowIndex = stemToRowIndex;