Package edu.ucla.sspace.clustering.seeding

Source Code of edu.ucla.sspace.clustering.seeding.OrssSeed

/*
* Copyright 2011 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.clustering.seeding;

import edu.ucla.sspace.common.Similarity;

import edu.ucla.sspace.matrix.Matrix;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.VectorMath;


/**
* Select seeds using a modification of the ORSS algorithm.  Seeds are selected
* from the data such that they are well scattered.  This implementation has a
* runtime of O(nkd) and is is based on the following paper:
*
*     <li style="font-family:Garamond, Georgia, serif"> Rafail Ostrovsky ,
*     Yuval Rabani ,  Leonard J. Schulman ,  Chaitanya Swamy, "The
*     effectiveness of lloyd-type methods for the k-means problem," in
*     <i>47th IEEE Symposium on the Foundations of Computer Science</i>,
*     2006</li>
*
* This involves 4 steps:
* <ol>
*   <li>First, the cost of an optimal solution to the K-Means
*       problem is solved for the 1 cluster case along (called
*       optimalSingleCost) with the centroid for this case (called c).</li>
*   <li>Then, an intial seed is selected from the set of data points (X)
*       with
*       probability p((optimalSingleCost + n*euclieanDist(x,c)^2)/2*n*optimalSingleCost) </li>
*   <li>The second centroid is selected from X with probability
*        p(euclideanDist(x, c_1)^2/(optimalSingleCost + n*euclideanDist(c,
*        c_1)^2)) </li>
*   <li>The remaining k-2 centroids are selected from X with probability
*       p(euclideanDist(x, c_nearest)^2)
*   </li>
* </ol>
*
* @author Keith Stevens
*/
public class OrssSeed implements KMeansSeed {

    /**
     * A small number used to determine when the centroids have converged.
     */
    private static final double EPSILON = 1e-3;

    /**
     * {@inheritDoc}
     */
    public DoubleVector[] chooseSeeds(int numCentroids, Matrix dataPoints) {
        int numDataPoints = dataPoints.rows();
        DoubleVector[] centroids = new DoubleVector[numCentroids];

        // Compute the centroid assuming that there is only one cluster to be
        // found.  This is required for computing the optimal cost of a single
        // cluster solution.
        DoubleVector singleCentroid = new DenseVector(dataPoints.columns());
        for (int i = 0; i < numDataPoints; ++i)
            VectorMath.add(singleCentroid, dataPoints.getRowVector(i));
        singleCentroid = new ScaledDoubleVector(
                singleCentroid, 1/((double)numDataPoints));

        // Compute the optimal cost of the single cluster case.
        double optimalSingleCost = 0;
        double[] distances = new double[numDataPoints];
        for (int i = 0; i < numDataPoints; ++i) {
            distances[i] = Similarity.euclideanDistance(
                    singleCentroid, dataPoints.getRowVector(i));
            optimalSingleCost += distances[i];
        }

        // Select the first centroid.  We pick the first centroid based on the
        // probability p((optimalSingleCost + n*euclideanDistance(x, c)^2)/
        //               2*n*optimalSingleCost)
        double probability = Math.random();
        double optimalDistance = 0;
        for (int i = 0; i < numDataPoints; ++i) {
            // Get the distance between each data point and the single
            // centroid.
            double distance = distances[i];
            // Compute the probability of selecting this data point.
            double cost = optimalSingleCost + numDataPoints*distance*distance;
            cost /= (2 * numDataPoints * optimalSingleCost);

            // Determine whether or not we should select the data point based on
            // it's probability.  Also save the distance from this new centroid
            // to the single centroid.
            probability -= cost;
            if (probability <= EPSILON) {
                centroids[0] = dataPoints.getRowVector(i);
                optimalDistance = distance;
                break;
            }
        }

        // Select the second centroid.
        probability = Math.random();
       
        // Precompute the normalizing factor for the probability of selecting a
        // data point.  In short this should be:
        //   optimalSingleCost * n*|euclideanDist(c, c_1)|^2
        double normalFactor =
            optimalSingleCost + numDataPoints*Math.pow(optimalDistance, 2);

        // Store the set minimum distance each data point has to a single
        // centroid.  This is used later on for selecting the last k-2
        // centroids.
        double[] minDistances = new double[numDataPoints];
        boolean selected = false;
        for (int i = 0; i < numDataPoints; ++i) {
            // Compute the distance.  Since the first centroid is the only one,
            // we know its the closest so store the distance.
            double distance = Similarity.euclideanDistance(
                    centroids[0], dataPoints.getRowVector(i));
            minDistances[i] = distance;

            // Determine whether or not we should select this data point as the
            // second centroid.
            double cost = Math.pow(distance, 2) / normalFactor;
            probability -= cost;
            if (probability <= EPSILON && !selected) {
                centroids[1] = dataPoints.getRowVector(i);
                selected = true;
            }
        }

        // Select the remaining k-2 centroids.
        // We pick a data point to be a new centroid based on the following
        // probability:
        //   p(|euclideanDist(dataPoint, nearestCentroid)|^2)
        // Since we store the minimum distance each data point has to any given
        // centroid in minDistances, all we need to do is update that with the
        // distance between each data point and the most recently selected
        // centroid.
        for (int c = 2; c < numCentroids; ++c) {
            selected = false;
            probability = Math.random();
            for (int i = 0; i < numDataPoints; ++i) {
                // Compute the distance between each data point and the most
                // recently selected centroid.  Update the minimum distance
                // array if the recent centroid is nearer.
                double distance = Similarity.euclideanDistance(
                        centroids[c-1], dataPoints.getRowVector(i));
                if (distance < minDistances[c])
                    minDistances[i] = distance;

                // Determine whether or not we should select this data point as
                // the next centroid.
                double cost = Math.pow(minDistances[i], 2);
                probability -= cost;
                if (probability <= EPSILON && !selected) {
                    centroids[c] = dataPoints.getRowVector(i);
                    selected = true;
                }
            }
        }

        return centroids;
    }
}
TOP

Related Classes of edu.ucla.sspace.clustering.seeding.OrssSeed

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.