/*
Copyright 2008-2010 Gephi
Authors : Mathieu Bastian <mathieu.bastian@gephi.org>
Website : http://www.gephi.org
This file is part of Gephi.
Gephi is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Gephi is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Gephi. If not, see <http://www.gnu.org/licenses/>.
*/
package org.gephi.clustering.plugin.mcl;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.gephi.clustering.api.Cluster;
import org.gephi.clustering.spi.Clusterer;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.Graph;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.utils.longtask.spi.LongTask;
import org.gephi.utils.progress.Progress;
import org.gephi.utils.progress.ProgressTicket;
/**
* MarkovClustering implements the Markov clustering (MCL) algorithm for graphs,
* using a HashMap-based sparse representation of a Markov matrix, i.e., an
* adjacency matrix m whose columns are normalised to sum to one. The elements
* in a column / node can be interpreted as the decision probabilities of a
* random walker currently at that node. Note: whereas we explain the algorithm
* in terms of columns, the actual implementation works with rows because the
* sparse matrix used has row-major format.
* <p>
* The basic idea underlying the MCL algorithm is that dense regions in sparse
* graphs correspond to regions in which the number of k-length paths is
* relatively large, for small k in N; counting such paths corresponds to
* raising the matrix of transition probabilities to the k-th power. Random
* walks of length k have a higher probability (product) for paths that begin
* and end in the same dense region than for other paths.
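* <p>
* Concretely, if M is the column-stochastic transition matrix, the entry
* (M*M)_ij = sum_k M_ik * M_kj is the probability of reaching node i from
* node j in exactly two steps, so repeated squaring accumulates the
* probability mass of longer walks.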
* <p>
* The algorithm starts by creating a Markov matrix from the graph: diagonal
* elements are first added to the adjacency matrix to include self-loops for
* all nodes, i.e., probabilities that the random walker stays at a particular
* node. After this initialisation, the algorithm works by alternating two
* operations, expansion and inflation, iteratively recomputing the set of
* transition probabilities. The expansion step corresponds to matrix
* multiplication (on stochastic matrices); the inflation step corresponds to a
* parametrised inflation operator Gamma_r, which acts column-wise on (column)
* stochastic matrices (here, we use the analogous row-wise operation).
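* <p>
* In outline (cf. {@link #run}):
* <pre>
* M = normalise(adjacency(G) + loopGain * I)
* while (residual &gt; maxResidual) {
*     M = M * M           // expansion
*     M = Gamma_r(M)      // inflation (with pruning)
* }
* </pre>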
* <p>
* The inflation operator transforms a stochastic matrix into another one by
* raising each element to a positive power p and re-normalising the columns to
* keep the matrix stochastic. The effect is that larger probabilities in each
* column are emphasised and smaller ones deemphasised. On the other hand, the
* matrix multiplication in the expansion step creates new non-zero elements,
* i.e., edges. The algorithm converges very quickly, and the result is an
* idempotent Markov matrix, M = M * M, which represents a hard clustering of
* the graph into components.
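* <p>
* For example, with p = 2 the column (0.5, 0.3, 0.2) becomes (0.25, 0.09,
* 0.04) after exponentiation and approximately (0.66, 0.24, 0.11) after
* re-normalisation: the dominant transition is emphasised further.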
* <p>
* Expansion and inflation have two opposing effects: While expansion flattens
* the stochastic distributions in the columns and thus causes paths of a random
* walker to become more evenly spread, inflation contracts them to favoured
* paths.
* <p>
* This description is based on the introduction of Stijn van Dongen's thesis
* Graph Clustering by Flow Simulation (2000); see there for a mathematical
* treatment of the algorithm and the associated MCL process.
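* <p>
* A minimal usage sketch (assuming a {@code GraphModel} obtained elsewhere,
* e.g., from Gephi's GraphController; the parameter values are illustrative):
* <pre>{@code
* MarkovClustering mcl = new MarkovClustering();
* mcl.setGammaExp(2.0);      // inflation exponent
* mcl.setLoopGain(1.0);      // self-loop weight
* mcl.setMaxResidual(0.001);
* mcl.execute(graphModel);
* Cluster[] result = mcl.getClusters();
* }</pre>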
*/
//Original author Gregor Heinrich
public class MarkovClustering implements Clusterer, LongTask {
private double maxResidual = 0.001;
private double gammaExp = 2.0;
private double loopGain = 0.;
private double zeroMax = 0.001;
private Cluster[] clusters;
//LongTask
private ProgressTicket progressTicket;
private boolean cancelled;
public void execute(GraphModel graphModel) {
cancelled = false;
Progress.start(progressTicket);
Progress.setDisplayName(progressTicket, "MCL Clustering");
HashMap<Integer, Node> nodeMap = new HashMap<Integer, Node>();
HashMap<Node, Integer> intMap = new HashMap<Node, Integer>();
Graph graph = graphModel.getGraphVisible();
graph.readLock();
//Load matrix
SparseMatrix matrix = new SparseMatrix();
int nodeId = 0;
for (Edge e : graph.getEdges()) {
Node source = e.getSource();
Node target = e.getTarget();
Integer sourceId;
Integer targetId;
if ((sourceId = intMap.get(source)) == null) {
sourceId = nodeId++;
intMap.put(source, sourceId);
nodeMap.put(sourceId, source);
}
if ((targetId = intMap.get(target)) == null) {
targetId = nodeId++;
intMap.put(target, targetId);
nodeMap.put(targetId, target);
}
double weight = e.getWeight();
matrix.add(sourceId, targetId, weight);
if (cancelled) {
graph.readUnlockAll();
return;
}
}
graph.readUnlock();
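// Work on the transpose: row-wise operations on it are equivalent to the
// column-wise operations of the textbook MCL description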
matrix = matrix.transpose();
matrix = run(matrix, maxResidual, gammaExp, loopGain, zeroMax);
if (cancelled) {
return;
}
Map<Integer, ArrayList<Integer>> map = getClusters(matrix);
if (cancelled) {
return;
}
int clusterNumber = 1;
List<Cluster> clustersList = new ArrayList<Cluster>();
// map may reference the same cluster list several times; keep each only once
Set<ArrayList<Integer>> seenClusters = new HashSet<ArrayList<Integer>>();
for (ArrayList<Integer> c : map.values()) {
if (seenClusters.add(c)) {
Node[] nodes = new Node[c.size()];
int i = 0;
for (Integer in : c) {
Node node = nodeMap.get(in);
nodes[i] = node;
i++;
}
clustersList.add(new MCLCluster(nodes, clusterNumber));
clusterNumber++;
}
if (cancelled) {
return;
}
}
clusters = clustersList.toArray(new Cluster[0]);
Progress.finish(progressTicket);
}
public Cluster[] getClusters() {
return clusters;
}
public boolean cancel() {
cancelled = true;
return true;
}
public void setProgressTicket(ProgressTicket progressTicket) {
this.progressTicket = progressTicket;
}
private static class MCLCluster implements Cluster {
private Node[] nodes;
private String name;
private Node metaNode;
public MCLCluster(Node[] nodes, int number) {
this.nodes = nodes;
this.name = "Cluster " + number;
}
public Node[] getNodes() {
return nodes;
}
public int getNodesCount() {
return nodes.length;
}
public String getName() {
return name;
}
public Node getMetaNode() {
return metaNode;
}
public void setMetaNode(Node node) {
this.metaNode = node;
}
}
/**
* Run the MCL process.
*
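* A minimal sketch for a 3-node path graph (SparseMatrix.add as used in
* {@link #execute(org.gephi.graph.api.GraphModel)}; the parameter values are
* illustrative):
* <pre>{@code
* SparseMatrix m = new SparseMatrix();
* m.add(0, 1, 1.0); m.add(1, 0, 1.0); // edge 0 -- 1
* m.add(1, 2, 1.0); m.add(2, 1, 1.0); // edge 1 -- 2
* m = run(m, 0.001, 2.0, 1.0, 0.001);
* }</pre>
*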
* @param a matrix
* @param maxResidual maximum difference between row elements and row square
* sum (measure of idempotence)
* @param pGamma inflation exponent for Gamma operator
* @param loopGain weight added to the diagonal (self-loop) elements
* @param maxZero maximum value considered zero for pruning operations
* @return the resulting matrix
*/
public SparseMatrix run(SparseMatrix a, double maxResidual, double pGamma, double loopGain, double maxZero) {
//System.out.println("original matrix\n" + a.transpose().toStringDense());
// add cycles
addLoops(a, loopGain);
// make stochastic
a.normaliseRows();
//System.out.println("normalised\n" + a.transpose().toStringDense());
double residual = 1.;
int i = 0;
if (cancelled) {
return a;
}
// main iteration
while (residual > maxResidual) {
i++;
a = expand(a);
residual = inflate(a, pGamma, maxZero);
System.out.println("residual energy = " + residual);
if (cancelled) {
return a;
}
}
return a;
}
/**
* inflate stochastic matrix by Hadamard (elementwise) exponentiation,
* pruning and normalisation:
* <p>
* result = Gamma ( m, p ) = normalise ( prune ( m .^ p ) ).
* <p>
* By convention, normalisation is done along rows (SparseMatrix has
* row-major representation)
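* <p>
* The returned residuum is computed row-wise as max(m_i) - sum(m_i .^ 2),
* which is 0 exactly when all non-zero elements of a row are equal, i.e.,
* when the row has reached the idempotent MCL limit.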
*
* @param m matrix (mutable)
* @param p exponent as a double
* @param zeromax threshold below which elements are pruned from the sparse
* matrix
* @return residuum value; m is modified in place
*/
public double inflate(SparseMatrix m, double p, double zeromax) {
double res = 0.;
// matlab: m = m .^ p
m.hadamardPower(p);
// matlab: m(find(m < threshold)) = 0
m.prune(zeromax);
// matlab [for cols]: dinv = diag(1./sum(m)); m = m * dinv; return
// sum(m)
SparseVector rowsums = m.normalise(1.);
// check if done: the per-row residuum is the difference between the maximum
// element and the sum of squared elements (0 once the row is idempotent)
for (int i : rowsums.keySet()) {
SparseVector row = m.get(i);
double max = row.max();
double sumsq = row.sum(2.);
res = Math.max(res, max - sumsq);
if (cancelled) {
return res;
}
}
return res;
}
/**
* expand stochastic quadratic matrix by squaring it, i.e., multiplying it
* with itself: result = m * m. Here normalisation is row-wise.
*
* @param m matrix
* @return new matrix (pointer != argument)
*/
public SparseMatrix expand(SparseMatrix m) {
m = m.times(m);
return m;
}
/**
* add loops with specific energy, which corresponds to adding loopGain to
* the diagonal elements.
*
* @param a
* @param loopGain
*/
private void addLoops(SparseMatrix a, double loopGain) {
if (loopGain <= 0) {
return;
}
for (int i = 0; i < a.size(); i++) {
a.add(i, i, loopGain);
if (cancelled) {
return;
}
}
}
public double getGammaExp() {
return gammaExp;
}
/**
* Set inflation exponent for Gamma operator. Default is 2.0
*/
public void setGammaExp(double gammaExp) {
this.gammaExp = gammaExp;
}
public double getLoopGain() {
return loopGain;
}
/**
* Set the loop gain, i.e., the weight added to diagonal (self-loop) elements. Default is 0.0
*/
public void setLoopGain(double loopGain) {
this.loopGain = loopGain;
}
public double getMaxResidual() {
return maxResidual;
}
/**
* Set the maximum difference between row elements and row square sum (measure of idempotence). Default is 0.001
*/
public void setMaxResidual(double maxResidual) {
this.maxResidual = maxResidual;
}
public double getZeroMax() {
return zeroMax;
}
/**
* Set the maximum value considered zero for pruning operations. Default is 0.001
*/
public void setZeroMax(double zeroMax) {
this.zeroMax = zeroMax;
}
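/**
 * Extract the hard clustering from the (near-)idempotent matrix: a non-zero
 * off-diagonal element at (i, j) means rows i and j belong to the same
 * cluster, and clusters sharing a member are merged transitively. A node
 * with no non-zero off-diagonal entry in its row or column is assigned to
 * no cluster.
 */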
private Map<Integer, ArrayList<Integer>> getClusters(SparseMatrix matrix) {
Map<Integer, ArrayList<Integer>> clusters = new HashMap<Integer, ArrayList<Integer>>();
int clusterCount = 0;
double[][] mat = matrix.getDense();
for (int i = 0; i < mat.length; i++) {
for (int j = 0; j < mat[0].length; j++) {
double value = mat[i][j];
if (value != 0.0) {
if (i == j) {
continue;
}
if (clusters.containsKey(j)) {
// Already seen "column" -- get the cluster and add column
ArrayList<Integer> columnCluster = clusters.get(j);
if (clusters.containsKey(i)) {
// We've already seen row also -- join them
ArrayList<Integer> rowCluster = clusters.get(i);
if (rowCluster == columnCluster) {
continue;
}
columnCluster.addAll(rowCluster);
clusterCount--;
} else {
// debugln("Adding "+row+" to "+columnCluster.getClusterNumber());
columnCluster.add(i);
}
for (Integer in : columnCluster) {
clusters.put(in, columnCluster);
}
} else {
ArrayList<Integer> rowCluster;
// First time we've seen "column" -- have we already seen "row"
if (clusters.containsKey(i)) {
// Yes, just add column to row's cluster
rowCluster = clusters.get(i);
// debugln("Adding "+column+" to "+rowCluster.getClusterNumber());
rowCluster.add(j);
} else {
rowCluster = new ArrayList<Integer>();
clusterCount++;
// debugln("Created new cluster "+rowCluster.getClusterNumber()+" with "+row+" and "+column);
rowCluster.add(j);
rowCluster.add(i);
}
for (Integer in : rowCluster) {
clusters.put(in, rowCluster);
}
}
}
if (cancelled) {
return clusters;
}
}
}
return clusters;
}
}