CandidateCluster fac = facilities.get(j);
veryVerbose(LOGGER, "Facility %d had a center of mass at %s",
j, fac.centerOfMass());
int clusterId = j;
IntIterator iter = fac.indices().iterator();
while (iter.hasNext()) {
int row = iter.nextInt();
assignments[row] =
new HardAssignment(clusterId);
}
}
return new Assignments(numClusters, assignments, matrix);
}
else {
verbose(LOGGER, "Had more than %d facilities, " +
"consolidating to %d", facilities.size(),
numClusters);
List<DoubleVector> facilityCentroids =
new ArrayList<DoubleVector>(facilities.size());
int[] weights = new int[facilities.size()];
int i = 0;
for (CandidateCluster fac : facilities) {
facilityCentroids.add(fac.centerOfMass());
weights[i++] = fac.size();
}
// Wrap the facilities centroids in a matrix for convenience
Matrix m = Matrices.asMatrix(facilityCentroids);
// Select the initial seed points for reducing the kappa
// clusters to k using the generalized ORSS selection
// process, which supports data comparisons other than
// Euclidean distance
GeneralizedOrssSeed orss = new GeneralizedOrssSeed(simFunc);
DoubleVector[] centroids = orss.chooseSeeds(numClusters, m);
assert nonNullCentroids(centroids)
: "ORSS seed returned too few centroids";
// This records the assignments of the kappa facilities to
// the k centers. Initially, everything is assigned to the
// same center and iterations repeat until convergence.
int[] facilityAssignments = new int[facilities.size()];
// Using those facilities as starting points, run k-means on
// the facility centroids until no facilities change their
// membership.
int numChanged = 0;
int kmeansIters = 0;
do {
numChanged = 0;
// Recompute the new centroids each time
DoubleVector[] updatedCentroids =
new DoubleVector[numClusters];
for (i = 0; i < updatedCentroids.length; ++i)
updatedCentroids[i] = new DenseVector(cols);
int[] updatedCentroidSizes = new int[numClusters];
double similaritySum = 0;
// For each CandidateCluster find the most similar centroid
i = 0;
for (CandidateCluster fac : facilities) {
int mostSim = -1;
double highestSim = -1;
for (int j = 0; j < centroids.length; ++j) {
// System.out.printf("centroids[%d]: %s%n fac.centroid(): %s%n",
// j, centroids[j],
// fac.centerOfMass());
double sim = simFunc.sim(centroids[j],
fac.centerOfMass());
if (sim > highestSim) {
highestSim = sim;
mostSim = j;
}
}
// For the most similar centroid, update its center
// of mass for the next round with the weighted
// vector
VectorMath.add(updatedCentroids[mostSim],
fac.sum());
updatedCentroidSizes[mostSim] += fac.size();
int curAssignment = facilityAssignments[i];
facilityAssignments[i] = mostSim;
similaritySum += highestSim;
if (curAssignment != mostSim) {
veryVerbose(LOGGER, "Facility %d changed its " +
"centroid from %d to %d",
i, curAssignment, mostSim);
numChanged++;
}
i++;
}
// Once all the facilities have been assigned to one of
// the k-centroids, recompute the centroids by
// normalizing the sum of the weighted vectors according
// to the number of points
for (int j = 0; j < updatedCentroids.length; ++j) {
DoubleVector v = updatedCentroids[j];
int size = updatedCentroidSizes[j];
for (int k = 0; k < cols; ++k)
v.set(k, v.get(k) / size);
// Update this centroid for the next round
centroids[j] = v;
}
veryVerbose(LOGGER, "%d centroids swapped their facility",
numChanged);
} while (numChanged > 0 &&
++kmeansIters < MAX_BATCH_KMEANS_ITERS);
// Use the final assignments to create assignments for each
// of the input data points
Assignment[] assignments = new Assignment[rows];
for (int j = 0; j < facilityAssignments.length; ++j) {
CandidateCluster fac = facilities.get(j);
veryVerbose(LOGGER, "Facility %d had a center of mass at %s",
j, fac.centerOfMass());
int clusterId = facilityAssignments[j];
IntIterator iter = fac.indices().iterator();
while (iter.hasNext()) {
int row = iter.nextInt();
assignments[row] =
new HardAssignment(clusterId);
}
}
return new Assignments(numClusters, assignments, matrix);