*
* @return the row vectors of the data points that were selected as seeds;
*         if {@code k} is at least the number of rows, every row is returned
*/
public DoubleVector[] chooseSeeds(Matrix dataPoints, int k, int[] weights) {
    // Selects up to k rows of dataPoints to act as initial seeds.  Two
    // centers are always drawn together first, so requests for fewer than
    // two seeds are handled explicitly below; otherwise the two selected
    // rows would not fit in a result array of length k.
    if (k < 0) {
        throw new IllegalArgumentException(
            "Cannot select a negative number of seeds: " + k);
    }

    int rows = dataPoints.rows();

    // Edge case for where the user has requested at least as many seeds as
    // there are rows.  In this case, just return the row vectors for all
    // of the rows.
    if (rows <= k) {
        DoubleVector[] arr = new DoubleVector[rows];
        for (int i = 0; i < rows; ++i)
            arr[i] = dataPoints.getRowVector(i);
        return arr;
    }

    // No seeds requested from a non-empty matrix: nothing to sample.
    if (k == 0)
        return new DoubleVector[0];

    // This array keeps the relative probability of that index's data point
    // being selected as a centroid.  Although the probabilities change
    // with each center added, the array is only allocated once and is
    // refilled using the determineProbabilities() method.
    double[] probabilities = new double[rows];

    // This array keeps the memoized computation of the maximum similarity
    // of each data point i, to any center currently in selected.  After
    // the first two points are selected, each iteration updates this array
    // with the maximum similarity of the new center to that point's index.
    double[] inverseSimilarities = new double[rows];

    // Pick the first two centers, x, y, with probability proportional to
    // 1/sim(x, y).  In the original paper the probability is proportional
    // to ||x - y||^2, which is the square of the distance between the two
    // points.  However, since we use the similarity (which is conceptually
    // the inverse of distance), we use the inverse similarity so that
    // elements that are more similar (i.e., larger values) have smaller
    // probabilities.
    IntPair firstTwoCenters =
        pickFirstTwo(dataPoints, simFunc, weights, inverseSimilarities);

    // A single seed was requested.  There is no dedicated one-seed sampler
    // here, so return the first member of the sampled pair rather than
    // overflowing a length-1 result array with both members.
    if (k == 1) {
        return new DoubleVector[] {
            dataPoints.getRowVector(firstTwoCenters.x)
        };
    }

    IntSet selected = new TroveIntSet();
    selected.add(firstTwoCenters.x);
    selected.add(firstTwoCenters.y);

    // For the remaining k-2 points to select, pick a random point, x, with
    // probability min(1/sim(x, c_i)) for all centers c_i in selected.
    // Again, this probability-based selection is updated from the original
    // ORSS paper, which used || x - c_i ||^2 for all centers c.  See the
    // comment above for the reasoning.
    for (int i = 2; i < k; i++) {
        // First, calculate the probabilities for selecting each point
        // given its similarity to any of the currently selected centers
        determineProbabilities(inverseSimilarities, weights,
                               probabilities, selected);

        // Then sample a point from the multinomial distribution over the
        // remaining points in dataPoints
        int point = selectWithProb(probabilities);

        // Once we've selected a point, add it to the set that we will
        // return and update the similarity of all other non-selected
        // points to be the highest similarity to any selected point
        boolean added = selected.add(point);
        assert added : "Added duplicate row to the set of selected points";
        updateNearestCenter(inverseSimilarities, dataPoints,
                            point, simFunc);
    }

    // Copy the selected rows out in the set's iteration order.  With
    // k >= 2 the set holds at most k indices, so every write is in bounds.
    IntIterator iter = selected.iterator();
    DoubleVector[] centroids = new DoubleVector[k];
    for (int i = 0; iter.hasNext(); ++i)
        centroids[i] = dataPoints.getRowVector(iter.nextInt());
    return centroids;
}