// Source Code of edu.ucla.sspace.clustering.DirectClustering

/*
 * Copyright 2011 Keith Stevens 
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.clustering;


import edu.ucla.sspace.clustering.criterion.CriterionFunction;
import edu.ucla.sspace.clustering.criterion.I1Function;


import edu.ucla.sspace.clustering.seeding.KMeansSeed;
import edu.ucla.sspace.clustering.seeding.RandomSeed;


import edu.ucla.sspace.common.Similarity;
import edu.ucla.sspace.common.Statistics;


import edu.ucla.sspace.matrix.Matrix;
import edu.ucla.sspace.matrix.*;


import edu.ucla.sspace.util.ReflectionUtil;


import edu.ucla.sspace.vector.*;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.ScaledDoubleVector;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.VectorMath;


import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOError;
import java.io.IOException;


import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Properties;




/**
 * An implementation of the Direct K-Means clustering available in the 
 * <a href="http://glaros.dtc.umn.edu/gkhome/views/cluto/">CLUTO Clustering
 * Software</a>.  This implementation of K-Means varies from the standard
 * implementation in a variety of ways:
 *
 * <li>
 *   <ol> After creating the initial set of clusters, each data point is
 *   assigned to a cluster if such a move improvies the overall clustering
 *   score.  This implementation is an iterative method, in that it makes the
 *   cluster change immediately and recomputes the centroid after every change.
 *   </ol>
 *   <ol> The clustering objective can be changed by using a different {@link
 *   edu.ucla.sspace.clustering.criterion.CriterionFunction CriterionFunction}.
 *   The {@link edu.ucla.sspace.clustering.criteria.I1Function I1Function}
 *   implements the standard K-Means criterion function.  Others try to optimize
 *   a variety of
 *   different objectives.</ol>
 * </li>
 *
 * @author Keith Stevens
 */
public class DirectClustering implements Clustering {


    /**
     * A property prefix for specifiying options when using {@link
     * DirectClustering}.
     */
    public static final String PROPERTY_PREFIX =
        "edu.ucla.sspace.clustering.DirectClustering";


    /**
     * The property to set the name of a {@link CriterionFunction} to use when
     * clustering the data.
     */
    public static final String CRITERIA_PROPERTY =
        PROPERTY_PREFIX + ".criteria";


    /**
     * The property to set the number of times a single run of {@link
     * DirectClustering} will be run.  If this is more than 1, the best scoring
     * run will be returned.
     */
    public static final String REPEAT_PROPERTY=
        PROPERTY_PREFIX + ".repeat";


    /**
     * A property for setting the {@link KMeansSeed} algorithm to use.
     */
    public static final String SEED_PROPERTY =
        PROPERTY_PREFIX + ".seed";


    /**
     * The default seed algorithm used.
     */
    private static final String DEFAULT_SEED =
        "edu.ucla.sspace.clustering.seeding.RandomSeed";


    /**
     * The default {@link CriterionFunction} to be used while clustering.
     */
    private static final String DEFAULT_CRITERION = 
        "edu.ucla.sspace.clustering.criterion.I1Function";


    /**
     * The default number of repetitions.
     */
    private static final String DEFAULT_REPEATS = "10";


    /**
     * Throws {@link UnsupportedOperationException}.
     */
    public Assignments cluster(Matrix matrix, Properties properties) {
        throw new UnsupportedOperationException(
                "DirectClustering requires the number of clusters to be set.");
    }


    /**
     * {@inheritDoc}
     */
    public Assignments cluster(Matrix matrix,
                               int numClusters,
                               Properties properties) {
        // Get the number of repitetions to use.
        int numRepetitions = Integer.parseInt(properties.getProperty(
                    REPEAT_PROPERTY, DEFAULT_REPEATS));


        // Get an instance of the seed generator.
        KMeansSeed seedType = ReflectionUtil.getObjectInstance(
                properties.getProperty(SEED_PROPERTY, DEFAULT_SEED));


        // Create the criterion function.
        CriterionFunction criterion = ReflectionUtil.getObjectInstance(
                properties.getProperty(CRITERIA_PROPERTY, DEFAULT_CRITERION));


        return cluster(matrix, numClusters, numRepetitions,
                       seedType, criterion);
    }


    /**
     * Clusters {@link matrix} using the {@link RandomSeed} seeding algorithm
     * and the default kmeans {@link CriterionFunction}.  The best scoring
     * solution out of {@code numRepetitions} will be returned.
     */
    public static Assignments cluster(Matrix matrix,
                                      int numClusters,
                                      int numRepetitions) {
        return cluster(matrix, numClusters, numRepetitions,
                       new RandomSeed(), new I1Function());
    }


    /**
     * Clusters {@link matrix} using the {@link RandomSeed}
     * seeding algorithm and the specified {@link CriterionFunction}. The best
     * scoring solution out of {@code numRepetitions} will be returned.
     */
    public static Assignments cluster(Matrix matrix,
                                      int numClusters,
                                      int numRepetitions,
                                      CriterionFunction criterion) {
        return cluster(matrix, numClusters, numRepetitions,
                       new RandomSeed(), criterion);
    }


    /**
     * Clusters {@link matrix} using the specified {@link SeedAlgorithm}
     * and the specified {@link CriterionFunction}. The best scoring solution
     * out of {@code numRepetitions} will be returned.
     */
    public static Assignments cluster(Matrix matrix,
                                      int numClusters,
                                      int numRepetitions,
                                      KMeansSeed seedType,
                                      CriterionFunction criterion) {
        int[] bestAssignment = null;
        double bestScore = (criterion.isMaximize()) ? 0 : Double.MAX_VALUE;
        for (int i = 0; i < numRepetitions; ++i) {
            clusterIteration(matrix, numClusters, seedType, criterion);
            if (criterion.isMaximize()) {
                if (criterion.score() > bestScore) {
                    bestScore = criterion.score();
                    bestAssignment = criterion.assignments();
                }
            } else {
                if (criterion.score() < bestScore) {
                    bestScore = criterion.score();
                    bestAssignment = criterion.assignments();
                }
            }
        }


        // Convert the array of assignments to an Assignments object.
        Assignment[] assignments = new Assignment[matrix.rows()];
        for (int i = 0; i < bestAssignment.length; ++i)
            assignments[i] = new HardAssignment(bestAssignment[i]);


        return new Assignments(numClusters, assignments, matrix);
    }


    /**
     * Performs one iteration of Direct Clustering over the data set.
     */
    private static void clusterIteration(Matrix matrix,
                                         int numClusters,
                                         KMeansSeed seedType,
                                         CriterionFunction criterion) {
        DoubleVector[] centers = seedType.chooseSeeds(numClusters, matrix);


        // Compute the initial set of assignments for each data point based on
        // the initial assignments.
        int[] initialAssignments = new int[matrix.rows()];


        // If there is to be only one cluster, then everything will be auto
        // assigned to the first cluster.  This is just a special case that only
        // comes up when comparing other solutions to the non-solution.
        if (numClusters != 1) {
            int nc = 0;
            for (int i = 0; i < matrix.rows(); ++i) {


                DoubleVector vector = matrix.getRowVector(i);
                double bestSimilarity = 0;
                for (int c = 0; c < numClusters; ++c) {
                    double similarity = Similarity.cosineSimilarity(
                            centers[c], vector);
                    nc++;
                    if (similarity >= bestSimilarity) {
                        bestSimilarity = similarity;
                        initialAssignments[i] = c;
                    }
                }
            }
        }


        // Setup the criterion function with it's meta data.
        criterion.setup(matrix, initialAssignments, numClusters);


        // Iteratively swap each data point to a better cluster if such an
        // assignment exists.
        List<Integer> indices = new ArrayList<Integer>(matrix.rows());
        for (int i = 0; i < matrix.rows(); ++i)
            indices.add(i);


        // Iterate through each data point in random order.  For each data
        // point, try to assign it to a new cluster.  If no data point is moved
        // in an iteration, end the iterations.
        boolean changed = true;
        while (changed) {
            changed = false;
            Collections.shuffle(indices);
            for (int index : indices)
                changed |= criterion.update(index);
        }
    }


    public String toString() {
        return "DirectClustering";
    }
}
// Source Code of edu.ucla.sspace.clustering.DirectClustering

// Related Classes of edu.ucla.sspace.clustering.DirectClustering