// not purely alphabetic (i.e. contains numbers or other symbols), don't
// bother clustering it. This is done to reduce the computation time,
// and to avoid clustering non-meaningful terms such as '.' or '''
if (!(term.matches("[a-zA-z]+") && numClusters > 6)) { // special case
SparseDoubleVector meanSenseVector =
new CompactSparseVector(termToIndex.size());
int rows = contexts.rows();
for (int row = 0; row < rows; ++row)
VectorMath.add(meanSenseVector, contexts.getRowVector(row));
termToVector.put(term, meanSenseVector);
return;
}
Assignments clusterAssignment =
new ClutoClustering().cluster(contexts, numClusters,
ClutoClustering.Method.AGGLOMERATIVE,
ClutoClustering.Criterion.UPGMA);
LOGGER.fine("Generative sense vectors for " + term);
// For each of the clusters, compute the mean sense vector
int[] clusterSize = new int[numClusters];
// Use CompactSparseVector to conserve memory given the potentially
// large number of sense vectors
SparseDoubleVector[] meanSenseVectors =
new CompactSparseVector[numClusters];
for (int i = 0; i < meanSenseVectors.length; ++i)
meanSenseVectors[i] = new CompactSparseVector(termToIndex.size());
// For each of the contexts, determine which cluster it was in and sum
// its value with the other contexts
for (int row = 0; row < clusterAssignment.size(); ++row) {
// Check whether this row was assigned a cluster