/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.matrix.Matrices;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;

import edu.ucla.sspace.util.Generator;

import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.ScaledSparseDoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.Vectors;
import edu.ucla.sspace.vector.VectorMath;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;

import java.util.logging.Logger;


/**
* Implementation of Spectral Clustering using divide and merge methodology.
* The implementation is based on two papers:
*
* <ul>
*   <li style="font-family:Garamond, Georgia, serif">Cheng, D., Kannan, R.,
*     Vempala, S., Wang, G.  (2006).  A Divide-and-Merge Methodology for
*     Clustering. <i>ACM Transactions on Database Systems</i>, <b>31</b>,
*     1499-1525.  Available <a
*     href="http://www-math.mit.edu/~vempala/papers/eigencluster.pdf">here</a>
*   </li>
*
*   <li style="font-family:Garamond, Georgia, serif">Kannan, R., Vempala, S.,
*     Vetta, A.  (2000).  On clustering: Good, bad, and spectral. 
*     <i>FOCS '00: Proceedings of the 41st Annual Symposium on Foundations of
*     Computer Science</i>.  Available <a
*     href="http://www-math.mit.edu/~vempala/papers/specfocs.ps">here</a>
*   </li>
* </ul>
*
* This class utilizes an {@link EigenCut} instance to compute a spectral
* partition of a data set.  Given a single data set, it will recursively
* partition that data set using an {@link EigenCut} instance until one of two
* limits is reached: all data points are in their own cluster, or the maximum
* number of partitions has been made, as set by the requested number of
* clusters.  Once all partitions are made, neighboring partitions may be
* merged according to the objective scores returned by the {@link EigenCut}
* instance.  If {@link #cluster(Matrix, int, boolean)} is used, the requested
* number of clusters, or fewer, will be returned; otherwise the algorithm will
* decide on the correct number of clusters.
*
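* <p>A minimal usage sketch (illustrative only; the {@link Generator} of
* {@link EigenCut} instances and the data {@link Matrix} are assumed to be
* supplied by the caller, and the {@code 0.4} weight is an arbitrary example
* value):
*
* <pre>{@code
* Generator<EigenCut> cutters = ...;   // yields a fresh EigenCut per partition
* SpectralClustering clustering = new SpectralClustering(0.4, cutters);
* Matrix dataPoints = ...;             // one data point per row
*
* // Request at most 10 clusters, merging with the relaxed correlation
* // objective rather than the k-means objective.
* Assignments result = clustering.cluster(dataPoints, 10, false);
* }</pre>
*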
* @see EigenCut
* @see BaseSpectralCut
*
* @author Keith Stevens
*/
public class SpectralClustering {

    /**
     * The logger used to record all output.
     */
    private static final Logger LOGGER =
        Logger.getLogger(SpectralClustering.class.getName());

    /**
     * The amount of weight given to inter-cluster similarity when using the
     * relaxed correlation objective function.
     */
    private final double alpha;

    /**
     * {@code 1 - alpha}, the weight given to intra-cluster dissimilarity.
     */
    private final double beta;

    /**
     * A generator function that creates fresh instances of an {@link
     * EigenCut} implementation on demand.  This is used to reduce the
     * reflection overhead, as many {@link EigenCut} instances may be needed.
     */
    private final Generator<EigenCut> cutterGenerator;

    /**
     * Creates a new {@link SpectralClustering} instance.
     *
     * @param alpha The weight given to inter-cluster similarity for the
     *        relaxed correlation objective function.  {@code beta} will be set
     *        to {@code 1 - alpha}.
     * @param cutterGenerator A {@link Generator} of {@link EigenCut} instances.
     */
    public SpectralClustering(double alpha,
                              Generator<EigenCut> cutterGenerator) {
        this.alpha = alpha;
        this.beta = 1 - alpha;
        this.cutterGenerator = cutterGenerator;
    }

    /**
     * Returns the Cluster {@link Assignments} of each data point in {@code
     * matrix}.  This method determines a "good" number of clusters
     * algorithmically.  It also uses the relaxed correlation objective
     * function, as the k-means objective function would always keep each data
     * point in its own cluster.
     */
    public Assignments cluster(Matrix matrix) {
        ClusterResult r = fullCluster(scaleMatrix(matrix), 0);
        verbose("Created " + r.numClusters + " clusters");

        Assignment[] assignments = new HardAssignment[r.assignments.length];
        for (int i = 0; i < r.assignments.length; ++i)
            assignments[i] = new HardAssignment(r.assignments[i]);

        return new Assignments(r.numClusters, assignments, matrix);
    }

    /**
     * Returns the Cluster {@link Assignments} for each data point in {@code
     * matrix}.  This method will return at most {@code maxClusters} clusters.
     * If {@code useKMeans} is true, the k-means objective function is used to
     * merge clusters; otherwise the relaxed correlation objective function is
     * used.
     */
    public Assignments cluster(Matrix matrix,
                               int maxClusters,
                               boolean useKMeans) {
        // Cluster the matrix recursively.
        LimitedResult[] results = limitedCluster(
                scaleMatrix(matrix), maxClusters, useKMeans);

        // If the matrix could not be cut, there will be only one result
        // returned, so check for that use case.
        LimitedResult r = (results.length == 1)
            ? results[0]
            : results[maxClusters-1];

        // Sometimes the desired number of clusters won't be computed, so fall
        // back to the result with the largest available number of clusters.
        for (int i = maxClusters - 2; r == null && i >= 0; i--)
            r = results[i];

        // Convert the LimitedResult object into an Assignments object.
        verbose("Created " + r.numClusters + " clusters");
        Assignment[] assignments = new HardAssignment[r.assignments.length];
        for (int i = 0; i < r.assignments.length; ++i)
            assignments[i] = new HardAssignment(r.assignments[i]);

        return new Assignments(r.numClusters, assignments, matrix);
    }

    /**
     * Returns a scaled {@link Matrix}, where each row is the unit
     * magnitude version of the corresponding row vector in {@code matrix}.
     * This is required so that dot product computations over a large number of
     * data points can be distributed, whereas the cosine similarity cannot be.
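     * For example, once every row has unit magnitude, the summed similarity of
     * a point {@code x} against points {@code y_1, ..., y_k} reduces to a
     * single dot product, since
     * {@code x · (y_1 + ... + y_k) = x · y_1 + ... + x · y_k}; the per-pair
     * normalization required by the cosine similarity prevents this factoring.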
     */
    private Matrix scaleMatrix(Matrix matrix) {
        // Scale every data point such that it has a dot product of 1 with
        // itself.  This will make further calculations easier since the dot
        // product distributes whereas the cosine similarity does not.
        if (matrix instanceof SparseMatrix) {
            List<SparseDoubleVector> scaledVectors =
                new ArrayList<SparseDoubleVector>(matrix.rows());
            SparseMatrix sm = (SparseMatrix) matrix;
            for (int r = 0; r < matrix.rows(); ++r) {
                SparseDoubleVector v = sm.getRowVector(r);
                scaledVectors.add(new ScaledSparseDoubleVector(
                            v, 1/v.magnitude()));
            }
            return Matrices.asSparseMatrix(scaledVectors);
        } else {
            List<DoubleVector> scaledVectors =
                new ArrayList<DoubleVector>(matrix.rows());
            for (int r = 0; r < matrix.rows(); ++r) {
                DoubleVector v = matrix.getRowVector(r);
                scaledVectors.add(new ScaledDoubleVector(v, 1/v.magnitude()));
            }
            return Matrices.asMatrix(scaledVectors);
        }
    }

    /**
     * Returns a {@link ClusterResult} when {@code matrix} is spectrally
     * clustered at a given {@code depth}.  This method recursively partitions
     * {@code matrix} until each partition has one or fewer rows, or until the
     * {@link EigenCut} declines to split a partition further.
     */
    private ClusterResult fullCluster(Matrix matrix,
                                      final int depth) {
        verbose("Clustering at depth " + depth);

        // If the matrix has only one data point (or none), all items trivially
        // belong to a single cluster.
        if (matrix.rows() <= 1)
            return new ClusterResult(new int[matrix.rows()], 1);

        // Get a fresh EigenCut instance and compute the spectral cut of the
        // matrix.
        EigenCut eigenCutter = cutterGenerator.generate();
        eigenCutter.computeCut(matrix);

        final Matrix leftMatrix = eigenCutter.getLeftCut();
        final Matrix rightMatrix = eigenCutter.getRightCut();

        verbose(String.format("Splitting into two matricies %d-%d",
                              leftMatrix.rows(), rightMatrix.rows()));

        // If the cut decided that the matrix should not be split, short
        // circuit any attempts to further cut the matrix.
        if (leftMatrix.rows() == matrix.rows() ||
            rightMatrix.rows() == matrix.rows())
            return new ClusterResult(new int[matrix.rows()], 1);


        // Cluster the left and right branches.  This could safely be done in
        // parallel because each recursive call to fullCluster uses its own
        // EigenCut instance, which holds all of the state for its particular
        // partition.
        final ClusterResult[] results = new ClusterResult[2];
        results[0] = fullCluster(leftMatrix, depth+1);
        results[1] = fullCluster(rightMatrix, depth+1);

        ClusterResult leftResult = results[0];
        ClusterResult rightResult = results[1];

        verbose("Merging at depth " + depth);

        // Compute the relaxed correlation objective function over the split
        // partitions found so far.
        double splitObjective = eigenCutter.getSplitObjective(
                alpha, beta,
                leftResult.numClusters, leftResult.assignments,
                rightResult.numClusters, rightResult.assignments);

        // Compute the objective when we merge the two branches together.
        double mergedObjective = eigenCutter.getMergedObjective(alpha, beta);

        // If the merged objective value is less than the split version, combine
        // all clusters into one.
        int[] assignments = new int[matrix.rows()];
        int numClusters = 1;
        if (mergedObjective < splitObjective) {
            verbose("Selecting to combine sub trees at depth " + depth);
            Arrays.fill(assignments, 0);
        } else  {
            verbose("Selecting to maintain sub trees at depth " + depth);

            // Copy over the left and right assignments, where the cluster ids
            // of the right assignments are offset by the number of left
            // clusters to avoid duplicate cluster ids.
            numClusters = leftResult.numClusters + rightResult.numClusters;

            int[] leftReordering = eigenCutter.getLeftReordering();
            int[] rightReordering = eigenCutter.getRightReordering();

            for (int index = 0; index < leftReordering.length; ++index)
                assignments[leftReordering[index]] =
                    leftResult.assignments[index];
            for (int index = 0; index < rightReordering.length; ++index)
                assignments[rightReordering[index]] =
                    rightResult.assignments[index] + leftResult.numClusters;
        }
        return new ClusterResult(assignments, numClusters);
    }

    /**
     * Returns up to {@code maxClusters} {@link LimitedResult}s using spectral
     * clustering with the relaxed correlation objective function.
     */
    private LimitedResult[] limitedCluster(Matrix matrix,
                                           int maxClusters) {
        return limitedCluster(matrix, maxClusters, false);
    }

    /**
     * Returns up to {@code maxClusters} {@link LimitedResult}s using spectral
     * clustering.  If {@code useKMeans} is true, the k-means objective function
     * is used for merging, otherwise the relaxed correlation objective function
     * is used.  The returned array is indexed such that entry {@code i} holds
     * the best result found with {@code i + 1} clusters, or {@code null} if no
     * clustering with that many clusters was produced.
     */
    private LimitedResult[] limitedCluster(Matrix matrix,
                                           final int maxClusters,
                                           final boolean useKMeans) {
        verbose("Clustering for " + maxClusters + " clusters.");

        // Get a fresh EigenCut first so that we can compute the rho sum of the
        // matrix, which is useful when computing the objective function for
        // the merged result.
        EigenCut eigenCutter = cutterGenerator.generate();

        // If the matrix has only one data point or only one cluster is
        // requested, place all items in a single cluster.
        if (matrix.rows() <= 1 || maxClusters <= 1) {
            eigenCutter.computeRhoSum(matrix);
            LimitedResult result;
            if (useKMeans)
                result = new KMeansLimitedResult(new int[matrix.rows()], 1,
                        eigenCutter.getKMeansObjective());
            else
                // When computing the inter-cluster similarity for the full
                // matrix, this is simply rhoSum without any self-similarity
                // scores.
                result = new SpectralLimitedResult(new int[matrix.rows()], 1,
                        eigenCutter.getMergedObjective(alpha, beta),
                        eigenCutter.rhoSum() - matrix.rows() / 2.0,
                        (matrix.rows() * (matrix.rows()-1)) / 2);
            return new LimitedResult[] {result};
        }

        // Compute the spectral cut of the matrix with the EigenCut instance
        // generated above.
        eigenCutter.computeCut(matrix);

        final Matrix leftMatrix = eigenCutter.getLeftCut();
        final Matrix rightMatrix = eigenCutter.getRightCut();

        // If the cut decided that the matrix should not be split, short
        // circuit any attempts to further cut the matrix.
        if (leftMatrix.rows() == matrix.rows() ||
            rightMatrix.rows() == matrix.rows()) {
            eigenCutter.computeRhoSum(matrix);
            LimitedResult result;
            if (useKMeans)
                result = new KMeansLimitedResult(new int[matrix.rows()], 1,
                        eigenCutter.getKMeansObjective());
            else
                // When computing the inter-cluster similarity for the full
                // matrix, this is simply rhoSum without any self-similarity
                // scores.
                result = new SpectralLimitedResult(new int[matrix.rows()], 1,
                        eigenCutter.getMergedObjective(alpha, beta),
                        eigenCutter.rhoSum() - matrix.rows() / 2.0,
                        (matrix.rows() * (matrix.rows()-1)) / 2);
            return new LimitedResult[] {result};
        }

        verbose(String.format("Splitting into two matricies %d-%d",
                              leftMatrix.rows(), rightMatrix.rows()));

        // Cluster the left and right branches.  This could safely be done in
        // parallel because each recursive call to limitedCluster uses its own
        // EigenCut instance, which holds all of the state for its particular
        // partition.
        final LimitedResult[][] subResults = new LimitedResult[2][];
        subResults[0] = limitedCluster(leftMatrix, maxClusters-1, useKMeans);
        subResults[1] = limitedCluster(rightMatrix, maxClusters-1, useKMeans);

        LimitedResult[] leftResults = subResults[0];
        LimitedResult[] rightResults = subResults[1];

        verbose("Merging at for: " + maxClusters + " clusters");

        // Get the re-ordering mapping for each partition.  Using this
        // re-ordering, compute the objective function for the entire data
        // matrix.
        int[] leftReordering = eigenCutter.getLeftReordering();
        int[] rightReordering = eigenCutter.getRightReordering();

        LimitedResult[] results = new LimitedResult[maxClusters];
        if (useKMeans)
            results[0] = new KMeansLimitedResult(new int[matrix.rows()], 1,
                    eigenCutter.getKMeansObjective());
        else
            // When computing the inter-cluster similarity for the full
            // matrix, this is simply rhoSum without any self-similarity
            // scores.
            results[0] = new SpectralLimitedResult(new int[matrix.rows()], 1,
                    eigenCutter.getMergedObjective(alpha, beta),
                    eigenCutter.rhoSum() - matrix.rows() / 2.0,
                    (matrix.rows() * (matrix.rows()-1)) / 2);

        // Each LimitedResult is ordered based on the number of clusters found.
        // Iterate through each clustering result computed for each partition
        // and find the combined set of cluster assignments that satisfies the
        // requested number of clusters.
        for (int i = 0; i < leftResults.length; ++i) {
            LimitedResult leftResult = leftResults[i];

            // If there are no assignments found for this number of clusters,
            // skip it.
            if (leftResult == null)
                continue;

            for (int j = 0; j < rightResults.length; ++j) {
                LimitedResult rightResult = rightResults[j];

                // If there are no assignments found for this number of
                // clusters, skip it.
                if (rightResult == null)
                    continue;

                // Compute the number of clusters found when using both the
                // left and right partitions.
                int numClusters =
                    leftResult.numClusters + rightResult.numClusters - 1;
                if (numClusters >= results.length)
                    continue;

                // Compute the combined objective function when using both the
                // left and right partition results.
                LimitedResult newResult;
                if (useKMeans)
                    newResult = leftResult.combine(leftResult, rightResult,
                            leftReordering, rightReordering, 0);
                else
                    newResult = leftResult.combine(
                            leftResult, rightResult,
                            leftReordering, rightReordering,
                            eigenCutter.rhoSum());

                // If no assignments have been made so far for this number of
                // clusters, or the found score is better than the old score,
                // store the combined LimitedResult as the best result for the
                // number of clusters.
                if (results[numClusters] == null ||
                    results[numClusters].compareTo(newResult) < 0)
                    results[numClusters] = newResult;
            }
        }

        return results;
    }



    /**
     * Logs data to info.
     */
    private void verbose(String out) {
        LOGGER.info(out);
    }

    /**
     * An internal helper class that combines cluster results from two
     * separate partitions using a particular objective function.
     */
    private abstract class LimitedResult {
       
        /**
         * The data assignments for a single {@link LimitedResult}.
         */
        public int[] assignments;

        /**
         * The number of clusters.
         */
        public int numClusters;

        /**
         * Creates a new {@link LimitedResult} using the given assignments and
         * number of clusters.
         */
        public LimitedResult(int[] assignments, int numClusters) {
            this.assignments = assignments;
            this.numClusters = numClusters;
        }

        /**
         * Combines the assignments made between two {@link LimitedResult}s.
         * The cluster ids in {@code res2} will be increased based on the number
         * of clusters in {@code res1}.
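         * For example (illustrative), if {@code res1} assigned its points to
         * clusters {0, 1} and {@code res2} assigned its points to cluster {0},
         * the combined assignment uses clusters {0, 1, 2}, with every point
         * from {@code res2} mapped to cluster 2.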
         */
        int[] combineAssignments(LimitedResult res1, LimitedResult res2,
                                 int[] ordering1, int[] ordering2) {
            int[] newAssignments = new int[
                res1.assignments.length + res2.assignments.length];

            for (int k = 0; k < ordering1.length; ++k)
                newAssignments[ordering1[k]] = res1.assignments[k];
            for (int k = 0; k < ordering2.length; ++k)
                newAssignments[ordering2[k]] =
                    res2.assignments[k] + res1.numClusters;
            return newAssignments;
        }

        /**
         * Returns the objective function score for this result.
         */
        abstract double score();

        /**
         * Returns greater than 0 when {@code this} {@link LimitedResult} should
         * be selected over {@code other}.
         */
        abstract double compareTo(LimitedResult other);

        /**
         * Returns a combined {@link LimitedResult} based on two existing {@link
         * LimitedResult}s.
         */
        abstract LimitedResult combine(LimitedResult res1, LimitedResult res2,
                                       int[] ordering1, int[] ordering2,
                                       double extra);
    }

    /**
     * Computes the k-means objective when combining two {@link LimitedResult}s.
     */
    private class KMeansLimitedResult extends LimitedResult {
       
        /**
         * The k-means objective score for this result.
         */
        public double score;

        /**
         * Creates a new {@link KMeansLimitedResult} with a given objective
         * score.
         */
        public KMeansLimitedResult(int[] assignments,
                                  int numClusters,
                                  double score) {
            super(assignments, numClusters);
            this.score = score;
        }

        /**
         * Returns a new {@link KMeansLimitedResult} with an updated score based
         * on the scores of {@code res1} and {@code res2}.
         */
        LimitedResult combine(LimitedResult res1, LimitedResult res2,
                              int[] ordering1, int[] ordering2,
                              double extra) {
            KMeansLimitedResult kres1 = (KMeansLimitedResult) res1;
            KMeansLimitedResult kres2 = (KMeansLimitedResult) res2;

            int[] newAssignments = combineAssignments(
                    res1, res2, ordering1, ordering2);
            int newNumClusters = res1.numClusters + res2.numClusters;
            double newScore = kres1.score + kres2.score;
            return new KMeansLimitedResult(
                    newAssignments, newNumClusters, newScore);
        }

        /**
         * Returns the k-means objective score.
         */
        public double score() {
            return score;
        }

        /**
         * Returns greater than 0 when this score is greater than other's score.
         */
        public double compareTo(LimitedResult other) {
            return this.score() - other.score();
        }
    }

    /**
     * Computes the relaxed correlation objective function when combining two
     * {@link LimitedResult}s.
     */
    private class SpectralLimitedResult extends LimitedResult {

        /**
         * The total objective score for this result.
         */
        public double totalScore;

        /**
         * The full intra-cluster similarity.
         */
        public double rawInterScore;

        /**
         * The number of intra-cluster similarity comparisons.
         */
        public int interCount;

        /**
         * Constructs a new {@link SpectralLimitedResult}.
         */
        public SpectralLimitedResult(int[] assignments, int numClusters,
                                     double totalScore, double rawInterScore,
                                     int interCount) {
            super(assignments, numClusters);

            this.totalScore = totalScore;
            this.rawInterScore = rawInterScore;
            this.interCount = interCount;
        }

        /**
         * Returns {@link #totalScore}.
         */
        public double score() {
            return totalScore;
        }

        /**
         * Returns greater than 0 when other's score is greater than this score.
         */
        public double compareTo(LimitedResult other) {
            return other.score() - this.score();
        }

        /**
         * Returns the combined {@link LimitedResult} when using the
         * relaxed correlation objective function.
         */
        LimitedResult combine(LimitedResult res1, LimitedResult res2,
                              int[] ordering1, int[] ordering2, double rhoSum) {
            // Assume that both are SpectralLimitedResults.
            SpectralLimitedResult sres1 = (SpectralLimitedResult) res1;
            SpectralLimitedResult sres2 = (SpectralLimitedResult) res2;

            // Get the raw combined assignments and the number of clusters.
            int[] newAssignments = combineAssignments(
                    res1, res2, ordering1, ordering2);
            int newNumClusters = res1.numClusters + res2.numClusters;

            // Use the raw inter-cluster similarity to compute the new
            // inter-cluster similarity and the number of inter-cluster
            // similarity computations.
            double newInterScore = sres1.rawInterScore + sres2.rawInterScore;
            int newCount = sres1.interCount + sres2.interCount;

            // We can compute the intra-cluster scores by subtracting the
            // self-similarity scores, any duplicate similarity scores, and the
            // inter-cluster similarity scores from rhoSum.  This leaves only
            // the intra-cluster similarity scores behind.
            double intraClusterScore =
                (rhoSum-newAssignments.length) / 2.0 - newInterScore;
            newInterScore = newCount - newInterScore;
            double newScore = alpha * newInterScore + beta * intraClusterScore;

            return new SpectralLimitedResult(newAssignments, newNumClusters,
                                            newScore, newInterScore, newCount);
        }
    }

    /**
     * A simple struct holding the cluster assignments and the number of
     * unique clusters generated.
     */
    private class ClusterResult {

        public int[] assignments;
        public int numClusters;

        public ClusterResult(int[] assignments, int numClusters) {
            this.assignments = assignments;
            this.numClusters = numClusters;
        }
    }
}