Source Code of edu.ucla.sspace.clustering.Assignments

/*
 * Copyright 2011 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.clustering;


import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.ScaledSparseDoubleVector;
import edu.ucla.sspace.vector.VectorMath;


import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;


import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;


import java.util.logging.Level;
import java.util.logging.Logger;




/**
 * The return value for all {@link Clustering} implementations.  This class
 * records the number of clusters created, the assignments for each value, and
 * helper methods for constructing the centroids of a cluster.
 *
 * @author Keith Stevens
 */
public class Assignments implements Iterable<Assignment> {


    /**
     * The logger to which clustering status updates will be written.
     */
    private static final Logger LOGGER =
        Logger.getLogger(Assignments.class.getName());    


    /**
     * The {@link Assignment}s made for each data point.
     */
    private Assignment[] assignments;


    /**
     * The number of clusters found from a particular algorithm.
     */
    private int numClusters;


    /**
     * The {@link Matrix} of data points that these {@link Assignments} link to.
     */
    private Matrix matrix;


    private int[] counts;


    /**
     * Creates a new {@link Assignments} instance that can hold up to {@code
     * numAssignments} {@link Assignment}s.  This assumes that the data matrix
     * will not be accessible.  Calls to {@link #getCentroids} will fail when
     * using this constructor.
     */
    public Assignments(int numClusters, int numAssignments) {
        this(numClusters, numAssignments, null);
    }


    /**
     * Creates a new {@link Assignments} instance that can hold up to {@code
     * numAssignments} {@link Assignment}s.
     */
    public Assignments(int numClusters, int numAssignments, Matrix matrix) {
        this.numClusters = numClusters;
        this.matrix = matrix;
        assignments = new Assignment[numAssignments];
    }


    /**
     * Creates a new {@link Assignments} instance that takes ownership of the
     * {@code initialAssignments} array.  This assumes that the data matrix will
     * not be accessible.  Calls to {@link #getCentroids} will fail when using
     * this constructor.
     */
    public Assignments(int numClusters,
                       Assignment[] initialAssignments) {
        this(numClusters, initialAssignments, null);
    }


    /**
     * Creates a new {@link Assignments} instance that takes ownership of the
     * {@code initialAssignments} array.
     */
    public Assignments(int numClusters,
                       Assignment[] initialAssignments, 
                       Matrix matrix) {
        this.numClusters = numClusters;
        this.matrix = matrix;
        assignments = initialAssignments;
    }


    /**
     * Sets {@link Assignment} {@code i} to have value {@code assignment}.
     */
    public void set(int i, Assignment assignment) {
        assignments[i] = assignment;
    }


    /**
     * Returns the number of {@link Assignment} objects stored.
     */
    public int size() {
        return assignments.length;
    }


    /**
     * Returns an iterator over the {@link Assignment} objects stored.
     */
    public Iterator<Assignment> iterator() {
        return Arrays.asList(assignments).iterator();
    }


    /**
     * Returns the {@link Assignment} object at index {@code i}.
     */
    public Assignment get(int i) {
        return assignments[i];
    }


    /**
     * Returns the number of clusters.
     */
    public int numClusters() {
        return numClusters;
    }


    /**
     * Returns the array of {@link Assignment} objects.
     */
    public Assignment[] assignments() {
        return assignments;
    }


    /**
     * Returns the data point indices assigned to each cluster.  Note that if
     * the underlying clustering algorithm does not put some items in a cluster
     * (i.e., their cluster assignment is negative) then these items will not be
     * returned as a part of any cluster.
     */
    public List<Set<Integer>> clusters() {
        List<Set<Integer>> clusters = new ArrayList<Set<Integer>>();
        for (int c = 0; c < numClusters; ++c)
            clusters.add(new HashSet<Integer>());
        for (int i = 0; i < assignments.length; ++i)
            for (int k : assignments[i].assignments()) {
                // Check that the item was place in a cluster
                if (k >= 0)
                    clusters.get(k).add(i);
            }
        return clusters;
    }


    /**
     * Returns an array of dense centroid vectors of each discovered cluster
     * which are scaled according the the number of data points asisgned to that
     * cluster.  Note that this method assumes that the original {@link Matrix}
     * holding the data points contains rows of feature vectors.  
     */
    public DoubleVector[] getCentroids() {
        if (matrix == null)
            throw new IllegalArgumentException(
                    "The data matrix was not passed to Assignments.");


        // Initialzie the centroid vectors and the cluster sizes.
        DoubleVector[] centroids = new DoubleVector[numClusters];
        counts = new int[numClusters];
        for (int c = 0; c < numClusters; ++c)
            centroids[c] = new DenseVector(matrix.columns());


        // For each initial assignment, add the vector to it's centroid and
        // increase the size of the cluster.
        int row = 0;
        for (Assignment assignment : assignments) {
            if (assignment.length() != 0) {
                // NOTE: why is this only using the first cluster?  Is this a
                // bug? -david
                int clus = assignment.assignments()[0];
                // Skip items whose cluster assignment indicates they were not
                // assigned to any cluster.
                if (clus < 0)
                    continue;
                counts[clus]++;
                DoubleVector centroid = centroids[assignment.assignments()[0]];
                VectorMath.add(centroid, matrix.getRowVector(row));
            }
            row++;
        }


        // Scale any non empty clusters by their size.
        for (int c = 0; c < numClusters; ++c)
            if (counts[c] != 0)
                centroids[c] = new ScaledDoubleVector(
                        centroids[c],1d/counts[c]);


        return centroids;
    }


    /**
     * Returns an array of sparse centroid vectors of each discovered cluster
     * which are scaled according the the number of data points asisgned to that
     * cluster.  This assumes that the original {@link Matrix} is sparse.  Note
     * that this method assumes that the original {@link Matrix} holding the
     * data points contains rows of feature vectors.  
     */
    public SparseDoubleVector[] getSparseCentroids() {
        if (matrix == null)
            throw new IllegalArgumentException(
                    "The data matrix was not passed to Assignments.");
        if (!(matrix instanceof SparseMatrix)) {
            LOGGER.fine(
                "The underlying matrix that was clustered is not sparse, " +
                "so sparse centroids may not be sparse");
        }
        
        // Initialzie the centroid vectors and the cluster sizes.
        SparseDoubleVector[] centroids = new SparseDoubleVector[numClusters];


        // If for some odd reason, no clusters were found, return no centroids.
        if (numClusters == 0)
            return centroids;


        counts = new int[numClusters];
        for (int c = 0; c < numClusters; ++c)
            centroids[c] = new CompactSparseVector(matrix.columns());


        // For each initial assignment, add the vector to it's centroid and
        // increase the size of the cluster.
        int row = 0;
        for (Assignment assignment : assignments) {
            // NOTE: why is this only using the first cluster?  Is this a
            // bug? -david
            //
            // Skip items whose cluster assignment indicates they were not
            // assigned to any cluster.
            if (assignment.length() != 0 && assignment.assignments()[0] >= 0) {
                counts[assignment.assignments()[0]]++;
                DoubleVector centroid = centroids[assignment.assignments()[0]];
                VectorMath.add(centroid, matrix.getRowVector(row));
            }
            row++;
        }


        // Scale any non empty clusters by their size.
        for (int c = 0; c < numClusters; ++c)
            if (counts[c] != 0)
                centroids[c] = new ScaledSparseDoubleVector(
                        centroids[c],1d/counts[c]);


        return centroids;
    }


    public int[] clusterSizes() {
        return counts;
    }
}
Source Code of edu.ucla.sspace.clustering.Assignments

Related Classes of edu.ucla.sspace.clustering.Assignments