// Source code of edu.ucla.sspace.clustering.Streemer

/*
 * Copyright 2012 David Jurgens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */


package edu.ucla.sspace.clustering;


import edu.ucla.sspace.matrix.Matrix;


import edu.ucla.sspace.similarity.SimilarityFunction;


import edu.ucla.sspace.util.primitive.IntIterator;
import edu.ucla.sspace.util.primitive.IntSet;
import edu.ucla.sspace.util.primitive.TroveIntSet;


import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.vector.Vectors;


import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;




/**
 * An implementation of the Streemer (<b>Stre</b>aming <b>EM</b>) algorithm for
 * foreground/background clustering.  The full algorithm is described in 
 *
 * <ul> <li> V. Kandylas, S. P. Upham, and L. Ungar.  Finding cohesive clusters
 *   for analyzing knowledge communities.  In <i>Proceedings of the Seventh IEEE
 *   Conference on Data Mining (ICDM)</i>.  IEEE Computer Society.  Omaha, NE.
 *   Available online <a
 *   href="http://citeseerx.ist.psu.edu/viewdoc/download;?doi=10.1.1.146.2954&rep=rep1&type=pdf">here</a>
 *   </li> </ul>
 *
 * </p>
 *
 *
 * @author David Jurgens
 */
public class Streemer implements Clustering {


    /**
     * Clusters the set of rows in the given {@code Matrix} without a specified
     * number of clusters 
     *
     * @param matrix {@inheritDoc}
     * @param props {@inheritDoc}
     *
     * @return {@inheritDoc}
     */
    public Assignments cluster(Matrix matrix, Properties props) {
        throw new Error();
    }


    /**
     * Clusters the set of rows in the given {@code Matrix} into the specified
     * number of clusters.  The set of cluster assignments are returned for each
     * row in the matrix.
     *
     * @param matrix {@inheritDoc}
     * @param numClusters {@inheritDoc}
     * @param props {@inheritDoc}
     *
     * @return {@inheritDoc}
     */
    public Assignments cluster(Matrix matrix, int numClusters, Properties props) {
        throw new Error();
    }


    public Assignments cluster(Matrix matrix, int numClusters, 
                               double backgroundClusterPerc, double similarityThreshold,
                               int minClusterSize, SimilarityFunction simFunc) {


        /*
         * FIRST PASS: generate a list of candidate clusters
         */        
        int rows = matrix.rows();
        List<CandidateCluster> candidateClusters = 
            new ArrayList<CandidateCluster>();


        // Base case: the first row gets its own cluster
        CandidateCluster first = new CandidateCluster();
        first.add(0, matrix.getRowVector(0));
        candidateClusters.add(first);


        // Loop through all remaining rows, either assigning them to the most
        // similar cluster, or splitting them off into their own cluster
        for (int r = 1; r < rows; ++r) {
            DoubleVector row = matrix.getRowVector(r);
            CandidateCluster mostSim = null;
            double highestSim = -1d;
            for (CandidateCluster cc : candidateClusters) {
                double sim = simFunc.sim(cc.centerOfMass(), row);
                if (sim > highestSim) {
                    mostSim = cc;
                    highestSim = sim;
                }                    
            }
            
            if (highestSim < similarityThreshold) {
                CandidateCluster cc = new CandidateCluster();
                cc.add(r, row);
                candidateClusters.add(cc);
            }
            else {
                mostSim.add(r, row);
            }
        }        


        /*
         * Generate the list of final clusters
         */
        List<CandidateCluster> finalClusters = 
            new ArrayList<CandidateCluster>();


        for (CandidateCluster cc : candidateClusters) {
            if (cc.size() < minClusterSize) 
                continue;
            
            double maxSim = -1;
            for (CandidateCluster cc2 : candidateClusters) {
                if (cc == cc2)
                    continue;
                double sim = simFunc.sim(cc.centerOfMass(), cc2.centerOfMass());
                if (sim > maxSim)
                    maxSim = sim;
            }
            if (maxSim < similarityThreshold)
                finalClusters.add(cc);
            // Compute the cluster cohesiveness for all clusters with sim >
            // threshold, adding the cluster with the highest to the final set
            else {
                CandidateCluster mostCohesive = null;
                double maxCohesiveness = -1;
                for (CandidateCluster cc2 : candidateClusters) {
                    if (cc == cc2)
                        continue;
                    double sim = simFunc.sim(cc.centerOfMass(), cc2.centerOfMass());
                    if (sim < similarityThreshold) 
                        continue;


                    IntIterator iter = cc2.indices().iterator();
                    double similaritySum = 0;
                    while (iter.hasNext()) {
                        DoubleVector v = matrix.getRowVector(iter.next());
                        similaritySum += simFunc.sim(cc2.centerOfMass(), v);
                    }
                    double avgSim = similaritySum / cc2.size();
                    
                    if (avgSim > maxCohesiveness) {
                        maxCohesiveness = avgSim;
                        mostCohesive = cc2;
                    }
                }
                finalClusters.add(mostCohesive);
            }
        }


        
        /*
         * OPTIONAL STEP: if we're inducing the number of clusters, keep the set
         * of final clusters as is; otherwise, ensure that the size of the set
         * is equal to the requested number of clusters
         */
        // TODO!


        int foundClusters = finalClusters.size();


        /*
         * THIRD PASS: compute the similarity distribution
         */
        double[] similarities = new double[rows];
        int[] clusterAssignments = new int[rows];
        for (int r = 0; r < rows; ++r) {
            DoubleVector v = matrix.getRowVector(r);
            double highestSim = -1;
            int mostSim = -1;
            for (int j = 0; j < foundClusters; ++j) {
                CandidateCluster cc = finalClusters.get(j);
                double sim = simFunc.sim(v, cc.centerOfMass());
                if (sim > highestSim) {
                    mostSim = j;
                    highestSim = sim;
                }
            }
            similarities[r] = highestSim;
            clusterAssignments[r] = mostSim;
        }


        // Create a copy of the similarities, which we'll sort and then use to
        // determine what the cutoff bound is for the background cluster
        double[] copy = Arrays.copyOf(similarities, similarities.length);
        Arrays.sort(copy);
        double cutoffSim = copy[(int)(copy.length * backgroundClusterPerc)];


        // Any data point whose similarity is less than the cutoff is relegated
        // to the background cluster.  (The next cluster id is the number of
        // clusters)
        int backgroundClusterId = foundClusters;


        Assignment[] assignments = new Assignment[rows];
        for (int i = 0; i < similarities.length; ++i) {
            if (similarities[i] < cutoffSim)
                clusterAssignments[i] = backgroundClusterId;
            assignments[i] = new HardAssignment(clusterAssignments[i]);
        }


        return new Assignments(foundClusters + 1, assignments, matrix);        
    }
}
// Source code of edu.ucla.sspace.clustering.Streemer

// Related classes of edu.ucla.sspace.clustering.Streemer