Package edu.ucla.sspace.clustering

Source Code of edu.ucla.sspace.clustering.Streemer

/*
* Copyright 2012 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.ucla.sspace.clustering;

import edu.ucla.sspace.matrix.Matrix;

import edu.ucla.sspace.similarity.SimilarityFunction;

import edu.ucla.sspace.util.primitive.IntIterator;
import edu.ucla.sspace.util.primitive.IntSet;
import edu.ucla.sspace.util.primitive.TroveIntSet;

import edu.ucla.sspace.vector.DenseVector;
import edu.ucla.sspace.vector.DoubleVector;
import edu.ucla.sspace.vector.SparseVector;
import edu.ucla.sspace.vector.SparseHashDoubleVector;
import edu.ucla.sspace.vector.VectorMath;
import edu.ucla.sspace.vector.Vectors;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;


/**
* An implementation of the Streemer (<b>Stre</b>aming <b>EM</b>) algorithm for
* foreground/background clustering.  The full algorithm is described in
*
* <ul> <li> V. Kandylas, S. P. Upham, and L. Ungar.  Finding cohesive clusters
*   for analyzing knowledge communities.  In <i>Proceedings of the Seventh IEEE
*   Conference on Data Mining (ICDM)</i>.  IEEE Computer Society.  Omaha, NE.
*   Available online <a
*   href="http://citeseerx.ist.psu.edu/viewdoc/download;?doi=10.1.1.146.2954&rep=rep1&type=pdf">here</a>
*   </li> </ul>
*
* </p>
*
*
* @author David Jurgens
*/
public class Streemer implements Clustering {

    /**
     * Clusters the set of rows in the given {@code Matrix} without a specified
     * number of clusters
     *
     * @param matrix {@inheritDoc}
     * @param props {@inheritDoc}
     *
     * @return {@inheritDoc}
     */
    public Assignments cluster(Matrix matrix, Properties props) {
        throw new Error();
    }

    /**
     * Clusters the set of rows in the given {@code Matrix} into the specified
     * number of clusters.  The set of cluster assignments are returned for each
     * row in the matrix.
     *
     * @param matrix {@inheritDoc}
     * @param numClusters {@inheritDoc}
     * @param props {@inheritDoc}
     *
     * @return {@inheritDoc}
     */
    public Assignments cluster(Matrix matrix, int numClusters, Properties props) {
        throw new Error();
    }

    public Assignments cluster(Matrix matrix, int numClusters,
                               double backgroundClusterPerc, double similarityThreshold,
                               int minClusterSize, SimilarityFunction simFunc) {

        /*
         * FIRST PASS: generate a list of candidate clusters
         */       
        int rows = matrix.rows();
        List<CandidateCluster> candidateClusters =
            new ArrayList<CandidateCluster>();

        // Base case: the first row gets its own cluster
        CandidateCluster first = new CandidateCluster();
        first.add(0, matrix.getRowVector(0));
        candidateClusters.add(first);

        // Loop through all remaining rows, either assigning them to the most
        // similar cluster, or splitting them off into their own cluster
        for (int r = 1; r < rows; ++r) {
            DoubleVector row = matrix.getRowVector(r);
            CandidateCluster mostSim = null;
            double highestSim = -1d;
            for (CandidateCluster cc : candidateClusters) {
                double sim = simFunc.sim(cc.centerOfMass(), row);
                if (sim > highestSim) {
                    mostSim = cc;
                    highestSim = sim;
                }                   
            }
           
            if (highestSim < similarityThreshold) {
                CandidateCluster cc = new CandidateCluster();
                cc.add(r, row);
                candidateClusters.add(cc);
            }
            else {
                mostSim.add(r, row);
            }
        }       

        /*
         * Generate the list of final clusters
         */
        List<CandidateCluster> finalClusters =
            new ArrayList<CandidateCluster>();

        for (CandidateCluster cc : candidateClusters) {
            if (cc.size() < minClusterSize)
                continue;
           
            double maxSim = -1;
            for (CandidateCluster cc2 : candidateClusters) {
                if (cc == cc2)
                    continue;
                double sim = simFunc.sim(cc.centerOfMass(), cc2.centerOfMass());
                if (sim > maxSim)
                    maxSim = sim;
            }
            if (maxSim < similarityThreshold)
                finalClusters.add(cc);
            // Compute the cluster cohesiveness for all clusters with sim >
            // threshold, adding the cluster with the highest to the final set
            else {
                CandidateCluster mostCohesive = null;
                double maxCohesiveness = -1;
                for (CandidateCluster cc2 : candidateClusters) {
                    if (cc == cc2)
                        continue;
                    double sim = simFunc.sim(cc.centerOfMass(), cc2.centerOfMass());
                    if (sim < similarityThreshold)
                        continue;

                    IntIterator iter = cc2.indices().iterator();
                    double similaritySum = 0;
                    while (iter.hasNext()) {
                        DoubleVector v = matrix.getRowVector(iter.next());
                        similaritySum += simFunc.sim(cc2.centerOfMass(), v);
                    }
                    double avgSim = similaritySum / cc2.size();
                   
                    if (avgSim > maxCohesiveness) {
                        maxCohesiveness = avgSim;
                        mostCohesive = cc2;
                    }
                }
                finalClusters.add(mostCohesive);
            }
        }

       
        /*
         * OPTIONAL STEP: if we're inducing the number of clusters, keep the set
         * of final clusters as is; otherwise, ensure that the size of the set
         * is equal to the requested number of clusters
         */
        // TODO!

        int foundClusters = finalClusters.size();

        /*
         * THIRD PASS: compute the similarity distribution
         */
        double[] similarities = new double[rows];
        int[] clusterAssignments = new int[rows];
        for (int r = 0; r < rows; ++r) {
            DoubleVector v = matrix.getRowVector(r);
            double highestSim = -1;
            int mostSim = -1;
            for (int j = 0; j < foundClusters; ++j) {
                CandidateCluster cc = finalClusters.get(j);
                double sim = simFunc.sim(v, cc.centerOfMass());
                if (sim > highestSim) {
                    mostSim = j;
                    highestSim = sim;
                }
            }
            similarities[r] = highestSim;
            clusterAssignments[r] = mostSim;
        }

        // Create a copy of the similarities, which we'll sort and then use to
        // determine what the cutoff bound is for the background cluster
        double[] copy = Arrays.copyOf(similarities, similarities.length);
        Arrays.sort(copy);
        double cutoffSim = copy[(int)(copy.length * backgroundClusterPerc)];

        // Any data point whose similarity is less than the cutoff is relegated
        // to the background cluster.  (The next cluster id is the number of
        // clusters)
        int backgroundClusterId = foundClusters;

        Assignment[] assignments = new Assignment[rows];
        for (int i = 0; i < similarities.length; ++i) {
            if (similarities[i] < cutoffSim)
                clusterAssignments[i] = backgroundClusterId;
            assignments[i] = new HardAssignment(clusterAssignments[i]);
        }

        return new Assignments(foundClusters + 1, assignments, matrix);       
    }
}
TOP

Related Classes of edu.ucla.sspace.clustering.Streemer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.