/*
* Copyright 2011 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.clustering;
import edu.ucla.sspace.vector.CompactSparseVector;
import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.SparseDoubleVector;
import edu.ucla.sspace.vector.ScaledSparseDoubleVector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.SparseMatrix;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* The return value for all {@link Clustering} implementations. This class
* records the number of clusters created, the assignments for each value, and
* helper methods for constructing the centroids of a cluster.
*
* @author Keith Stevens
*/
public class Assignments implements Iterable<Assignment> {

    /**
     * The logger to which clustering status updates will be written.
     */
    private static final Logger LOGGER =
        Logger.getLogger(Assignments.class.getName());

    /**
     * The {@link Assignment}s made for each data point.  The index of an
     * assignment in this array is also used as the row index of the
     * corresponding data point in {@link #matrix}.
     */
    private Assignment[] assignments;

    /**
     * The number of clusters found from a particular algorithm.
     */
    private int numClusters;

    /**
     * The {@link Matrix} of data points that these {@link Assignments} link
     * to.  May be {@code null} when the instance was created without a data
     * matrix.
     */
    private Matrix matrix;

    /**
     * The number of data points summed into each centroid during the most
     * recent centroid computation.  Remains {@code null} until {@link
     * #getCentroids} or {@link #getSparseCentroids} has populated it.
     */
    private int[] counts;

    /**
     * Creates a new {@link Assignments} instance that can hold up to {@code
     * numAssignments} {@link Assignment}s.  This assumes that the data matrix
     * will not be accessible.  Calls to {@link #getCentroids} will fail when
     * using this constructor.
     *
     * @param numClusters the number of clusters that were found
     * @param numAssignments the number of assignment slots to allocate
     */
    public Assignments(int numClusters, int numAssignments) {
        this(numClusters, numAssignments, null);
    }

    /**
     * Creates a new {@link Assignments} instance that can hold up to {@code
     * numAssignments} {@link Assignment}s.  Every slot is initially {@code
     * null} and is expected to be filled in through {@link #set}.
     *
     * @param numClusters the number of clusters that were found
     * @param numAssignments the number of assignment slots to allocate
     * @param matrix the data matrix whose rows were clustered, or {@code
     *        null} if it is not available
     */
    public Assignments(int numClusters, int numAssignments, Matrix matrix) {
        this.numClusters = numClusters;
        this.matrix = matrix;
        assignments = new Assignment[numAssignments];
    }

    /**
     * Creates a new {@link Assignments} instance that takes ownership of the
     * {@code initialAssignments} array.  This assumes that the data matrix
     * will not be accessible.  Calls to {@link #getCentroids} will fail when
     * using this constructor.
     *
     * @param numClusters the number of clusters that were found
     * @param initialAssignments the assignments, stored without copying
     */
    public Assignments(int numClusters,
                       Assignment[] initialAssignments) {
        this(numClusters, initialAssignments, null);
    }

    /**
     * Creates a new {@link Assignments} instance that takes ownership of the
     * {@code initialAssignments} array.
     *
     * @param numClusters the number of clusters that were found
     * @param initialAssignments the assignments, stored without copying
     * @param matrix the data matrix whose rows were clustered, or {@code
     *        null} if it is not available
     */
    public Assignments(int numClusters,
                       Assignment[] initialAssignments,
                       Matrix matrix) {
        this.numClusters = numClusters;
        this.matrix = matrix;
        assignments = initialAssignments;
    }

    /**
     * Sets {@link Assignment} {@code i} to have value {@code assignment}.
     */
    public void set(int i, Assignment assignment) {
        assignments[i] = assignment;
    }

    /**
     * Returns the number of {@link Assignment} objects stored.
     */
    public int size() {
        return assignments.length;
    }

    /**
     * Returns an iterator over the {@link Assignment} objects stored.
     */
    @Override
    public Iterator<Assignment> iterator() {
        return Arrays.asList(assignments).iterator();
    }

    /**
     * Returns the {@link Assignment} object at index {@code i}.
     */
    public Assignment get(int i) {
        return assignments[i];
    }

    /**
     * Returns the number of clusters.
     */
    public int numClusters() {
        return numClusters;
    }

    /**
     * Returns the array of {@link Assignment} objects.  The internal array
     * itself is returned, not a copy.
     */
    public Assignment[] assignments() {
        return assignments;
    }

    /**
     * Returns the data point indices assigned to each cluster.  Note that if
     * the underlying clustering algorithm does not put some items in a
     * cluster (i.e., their cluster assignment is negative) then these items
     * will not be returned as a part of any cluster.
     *
     * @return a list with one set per cluster, where the set at position
     *         {@code k} holds the indices of the data points assigned to
     *         cluster {@code k}
     */
    public List<Set<Integer>> clusters() {
        List<Set<Integer>> clusters = new ArrayList<Set<Integer>>();
        for (int c = 0; c < numClusters; ++c)
            clusters.add(new HashSet<Integer>());
        for (int i = 0; i < assignments.length; ++i) {
            for (int k : assignments[i].assignments()) {
                // Check that the item was placed in a cluster.
                if (k >= 0)
                    clusters.get(k).add(i);
            }
        }
        return clusters;
    }

    /**
     * Returns an array of dense centroid vectors of each discovered cluster
     * which are scaled according to the number of data points assigned to
     * that cluster.  Note that this method assumes that the original {@link
     * Matrix} holding the data points contains rows of feature vectors.  Only
     * the first cluster listed in each {@link Assignment} is used; data points
     * with no assignment, or whose first assignment is negative, do not
     * contribute to any centroid.
     *
     * @return one centroid per cluster; clusters with no members are left as
     *         unscaled zero-initialized vectors
     *
     * @throws IllegalArgumentException if no data matrix was provided at
     *         construction time
     */
    public DoubleVector[] getCentroids() {
        if (matrix == null)
            throw new IllegalArgumentException(
                    "The data matrix was not passed to Assignments.");

        // Initialize the centroid vectors.
        DoubleVector[] centroids = new DoubleVector[numClusters];
        for (int c = 0; c < numClusters; ++c)
            centroids[c] = new DenseVector(matrix.columns());

        // Sum the rows into their centroids and record the cluster sizes.
        sumRowsIntoCentroids(centroids);

        // Scale any non empty clusters by their size.
        for (int c = 0; c < numClusters; ++c)
            if (counts[c] != 0)
                centroids[c] = new ScaledDoubleVector(
                        centroids[c], 1d / counts[c]);
        return centroids;
    }

    /**
     * Returns an array of sparse centroid vectors of each discovered cluster
     * which are scaled according to the number of data points assigned to
     * that cluster.  This assumes that the original {@link Matrix} is sparse.
     * Note that this method assumes that the original {@link Matrix} holding
     * the data points contains rows of feature vectors.  Only the first
     * cluster listed in each {@link Assignment} is used; data points with no
     * assignment, or whose first assignment is negative, do not contribute to
     * any centroid.
     *
     * @return one centroid per cluster, or an empty array when there are no
     *         clusters; clusters with no members are left as unscaled empty
     *         vectors
     *
     * @throws IllegalArgumentException if no data matrix was provided at
     *         construction time
     */
    public SparseDoubleVector[] getSparseCentroids() {
        if (matrix == null)
            throw new IllegalArgumentException(
                    "The data matrix was not passed to Assignments.");

        if (!(matrix instanceof SparseMatrix)) {
            LOGGER.fine(
                    "The underlying matrix that was clustered is not sparse, " +
                    "so sparse centroids may not be sparse");
        }

        SparseDoubleVector[] centroids = new SparseDoubleVector[numClusters];

        // If for some odd reason, no clusters were found, return no
        // centroids.  The recorded cluster sizes are left untouched.
        if (numClusters == 0)
            return centroids;

        // Initialize the centroid vectors.
        for (int c = 0; c < numClusters; ++c)
            centroids[c] = new CompactSparseVector(matrix.columns());

        // Sum the rows into their centroids and record the cluster sizes.
        sumRowsIntoCentroids(centroids);

        // Scale any non empty clusters by their size.
        for (int c = 0; c < numClusters; ++c)
            if (counts[c] != 0)
                centroids[c] = new ScaledSparseDoubleVector(
                        centroids[c], 1d / counts[c]);
        return centroids;
    }

    /**
     * Returns the cluster sizes recorded by the most recent call to {@link
     * #getCentroids} or {@link #getSparseCentroids}.  The internal array is
     * returned without copying.
     *
     * @return the number of data points summed into each centroid, or {@code
     *         null} if no centroid computation has recorded sizes yet
     */
    public int[] clusterSizes() {
        return counts;
    }

    /**
     * Resets {@link #counts} and adds each assigned data point's row vector
     * to the centroid of the first cluster listed in its assignment.
     *
     * The matrix row is taken from the assignment's position in {@link
     * #assignments}, so skipping an unassigned data point can never shift
     * the rows used for the data points that follow it.
     *
     * @param centroids the zero-initialized accumulators, one per cluster;
     *        they are modified in place
     */
    private void sumRowsIntoCentroids(DoubleVector[] centroids) {
        counts = new int[numClusters];
        for (int row = 0; row < assignments.length; ++row) {
            Assignment assignment = assignments[row];

            // Data points without any assignment contribute nothing.
            if (assignment.length() == 0)
                continue;

            // NOTE: only the first listed cluster is considered, even when
            // an item has been assigned to several clusters.
            int cluster = assignment.assignments()[0];

            // Skip items whose cluster assignment indicates they were not
            // assigned to any cluster.
            if (cluster < 0)
                continue;

            counts[cluster]++;
            VectorMath.add(centroids[cluster], matrix.getRowVector(row));
        }
    }
}