Source Code of edu.ucla.sspace.clustering.BisectingKMeans

/*
 * Copyright 2010 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.clustering;


import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.Matrices;


import edu.ucla.sspace.vector.DoubleVector;


import java.util.ArrayList;
import java.util.List;
import java.util.Properties;


import java.util.logging.Logger;




/**
 * An implementation of the Bisecting K-Means algorithm, also known as Repeated
 * Bisections.  This implementation is based on the following paper:
 *
 *   <li style="font-family:Garamond, Georgia, serif">Michael Steinbach,
 *   George Karypis, Vipin Kumar.  "A comparison of document clustering
 *   techniques," in <i>KDD Workshop on Text Mining</i>, 2000</li>
 *
 * This clustering algorithm improves upon the standard K-Means algorithm by
 * taking a data set and repeatedly splitting the data points into two regions.
 * Initially all data points are separated into two clusters.  Then, until the
 * desired number of clusters are created, the largest cluster is divided using
 * K-Means with K equal to 2.  This implementation relies on the {@link
 * DirectClustering} implementation.  Any properties passed to this clustering
 * method are passed onto the {@link DirectClustering} algorithm, allowing the
 * user to set the desired seeding method.
 *
 * @see KMeansClustering
 * 
 * @author Keith Stevens
 */
public class BisectingKMeans implements Clustering {


    /**
     * Not implemented.
     */
    public Assignments cluster(Matrix dataPoints, Properties props) {
        throw new UnsupportedOperationException(
                "KMeansClustering requires that the " +
                "number of clusters be specified");
    }


    /**
     * {@inheritDoc}
     */
    public Assignments cluster(Matrix dataPoints,
                               int numClusters,
                               Properties props) {
        // Handle a simple base case.
        if (numClusters <= 1) {
            Assignment[] assignments = new Assignment[dataPoints.rows()];
            for (int i = 0; i < assignments.length; ++i)
                assignments[i] = new HardAssignment(0);
            return new Assignments(numClusters, assignments, dataPoints);
        }


        // Create a count of cluster assignments.
        int[] numAssignments = new int[numClusters];


        // Create a list of lists.  The inner list represents the vectors
        // assigned to a particular cluster.  We use this method so that we can
        // easily transform the cluster to a Matrix
        List<List<DoubleVector>> clusters = new ArrayList<List<DoubleVector>>(
                numClusters);
        for (int c = 0; c < numClusters; ++c)
            clusters.add(new ArrayList<DoubleVector>());


        Clustering clustering = new DirectClustering();
        // Make the first bisection.
        Assignment[] assignments =
            clustering.cluster(dataPoints, 2, props).assignments();


        // Count the number of assignments made to each cluster and move the
        // vectors in to the corresponding list.
        for (int i = 0; i < assignments.length; ++i) {
            int assignment = assignments[i].assignments()[0];
            numAssignments[assignment]++;
            clusters.get(assignment).add(dataPoints.getRowVector(i));
        }


        // Generate the numClusters - 2 clusters by finding the largest cluster
        // and bisecting it.  Of the 2 resulting clusters, one will maintain the
        // same cluster index and the other will be given a new cluster index,
        // namely k, the current cluster number.
        for (int k = 2; k < numClusters; k++) {
            // Find the largest cluster.
            int largestSize = 0;
            int largestIndex = 0;
            for (int c = 0; c < numClusters; ++c) {
                if (numAssignments[c] > largestSize) {
                    largestSize = numAssignments[c];
                    largestIndex = c;
                }
            }


            // Get the list of vectors representing the cluster being split and
            // the cluster that will hold the vectors split off from this
            // cluster.
            List<DoubleVector> originalCluster = clusters.get(largestIndex);
            List<DoubleVector> newCluster = clusters.get(k);


            // Split the largest cluster.
            Matrix clusterToSplit = Matrices.asMatrix(originalCluster);
            Assignment[] newAssignments = 
                clustering.cluster(clusterToSplit, 2, props).assignments();


            // Clear the lists for cluster being split and the new cluster.
            // Also clear the number of assignments.
            originalCluster.clear();
            newCluster.clear();
            numAssignments[largestIndex] = 0;
            numAssignments[k] = 0;


            // Reassign data points in the largest cluster.  Data points
            // assigned to the 0 cluster maintain their cluster number in the
            // real assignment list.  Data points assigned to cluster 1 get the
            // new cluster number, k.  
            for (int i = 0, j = 0; i < dataPoints.rows(); ++i) {
                if (assignments[i].assignments()[0] == largestIndex) {
                    // Make the assignment for vectors that keep their
                    // assignment.
                    if (newAssignments[j].assignments()[0] == 0) {
                        originalCluster.add(dataPoints.getRowVector(i));
                        numAssignments[largestIndex]++;
                    }
                    // Make the assignment for vectors that have changed their
                    // assignment.
                    else {
                        newCluster.add(dataPoints.getRowVector(i));
                        assignments[i] = new HardAssignment(k);
                        numAssignments[k]++;
                    }
                    j++;
                }
            }
        }
        return new Assignments(numClusters, assignments, dataPoints);
    }


    public String toString() {
        return "BisectingKMeans";
    }
}
Source Code of edu.ucla.sspace.clustering.BisectingKMeans

Related Classes of edu.ucla.sspace.clustering.BisectingKMeans