double deltaPoints = (double) instances.size();
int iterations = 0;
SparseVector clusterMean;
for (int c = 0; c < numClusters; c++) {
instanceClusters.add(c, new InstanceList(instancePipe));
}
logger.info("Entering KMeans iteration");
while (deltaMeans > MEANS_TOLERANCE && iterations < MAX_ITER
&& deltaPoints > instances.size() * POINTS_TOLERANCE) {
iterations++;
deltaPoints = 0;
// For each instance, measure its distance to each of the current
// cluster means, then assign it to the closest cluster by adding it
// to the corresponding instance list.
// The mean of each cluster InstanceList is then updated.
for (int n = 0; n < instances.size(); n++) {
instClust = 0;
instClustDist = Double.MAX_VALUE;
for (int c = 0; c < numClusters; c++) {
instDist = metric.distance(clusterMeans.get(c),
(SparseVector) instances.get(n).getData());
if (instDist < instClustDist) {
instClust = c;
instClustDist = instDist;
}
}
// Add to closest cluster & label it such
instanceClusters.get(instClust).add(instances.get(n));
if (clusterLabels[n] != instClust) {
clusterLabels[n] = instClust;
deltaPoints++;
}
}
deltaMeans = 0;
for (int c = 0; c < numClusters; c++) {
if (instanceClusters.get(c).size() > 0) {
clusterMean = VectorStats.mean(instanceClusters.get(c));
deltaMeans += metric.distance(clusterMeans.get(c), clusterMean);
clusterMeans.set(c, clusterMean);
instanceClusters.set(c, new InstanceList(instancePipe));
} else {
logger.info("Empty cluster found.");
switch (emptyAction) {
case EMPTY_ERROR:
return null;
case EMPTY_DROP:
logger.fine("Removing cluster " + c);
clusterMeans.remove(c);
instanceClusters.remove(c);
for (int n = 0; n < instances.size(); n++) {
assert (clusterLabels[n] != c) : "Cluster size is "
+ instanceClusters.get(c).size()
+ "+ yet clusterLabels[n] is " + clusterLabels[n];
if (clusterLabels[n] > c)
clusterLabels[n]--;
}
numClusters--;
c--; // <-- note this trickiness. bad style? maybe.
// it just means now that we've deleted the entry,
// we have to repeat the index to get the next entry.
break;
case EMPTY_SINGLE:
// Get the instance the furthest from any centroid
// and make it a new centroid.
double newCentroidDist = 0;
int newCentroid = 0;
InstanceList cacheList = null;
for (int clusters = 0; clusters < clusterMeans.size(); clusters++) {
SparseVector centroid = clusterMeans.get(clusters);
InstanceList centInstances = instanceClusters.get(clusters);
// Don't create new empty clusters.
if (centInstances.size() <= 1)
continue;
for (int n = 0; n < centInstances.size(); n++) {
double currentDist = metric.distance(centroid,
(SparseVector) centInstances.get(n).getData());
if (currentDist > newCentroidDist) {
newCentroid = n;
newCentroidDist = currentDist;
cacheList = centInstances;