package edu.cmu.graphchi.util;
import edu.cmu.graphchi.vertexdata.ForeachCallback;
import edu.cmu.graphchi.vertexdata.VertexAggregator;
import edu.cmu.graphchi.datablocks.IntConverter;
import edu.cmu.graphchi.preprocessing.VertexIdTranslate;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
/**
* Utility for counting the number of different labels in the
* vertex-data. Vertices whose label equals their own vertex-id are
* not counted. This is used for connected components and community
* detection applications.
* @author Aapo Kyrola
*/
public class LabelAnalysis {

    /* TODO: faster, more memory efficient */

    /**
     * Analyzes the labels of the vertices and writes a file
     * {@code baseFilename + ".components"} containing one {@code label,count}
     * line per label. A label only gets an entry when at least one vertex
     * whose id differs from the label carries it, so singleton labels are
     * not listed.
     *
     * @param baseFilename input graph file; also the prefix of the output file name
     * @param numVertices number of vertices in graph
     * @param translate vertex-id translator (from internal to actual ids); used
     *                  only when writing labels to the output file
     * @return the per-label counts, keyed internally by the untranslated label;
     *         this is the {@code values()} view of the map built by this method
     * @throws IOException if the ".components" file cannot be created or written
     *                     (the vertex-data scan may also raise it; its
     *                     signature is not visible here)
     */
    public static Collection<IdCount> computeLabels(String baseFilename, int numVertices,
                                                    VertexIdTranslate translate) throws IOException {
        final HashMap<Integer, IdCount> counts = new HashMap<Integer, IdCount>(1000000);

        VertexAggregator.foreach(numVertices, baseFilename, new IntConverter(), new ForeachCallback<Integer>() {
            public void callback(int vertexId, Integer vertexValue) {
                // Vertices labelled with their own id are skipped here.
                if (vertexId != vertexValue) {
                    IdCount cnt = counts.get(vertexValue);
                    if (cnt == null) {
                        // The entry starts at 1 and is incremented right below, so the
                        // first hit yields 2. NOTE(review): presumably the initial 1
                        // accounts for the skipped vertex whose id equals the label;
                        // confirm before changing.
                        cnt = new IdCount(vertexValue, 1);
                        counts.put(vertexValue, cnt);
                    }
                    cnt.count++;
                }
            }
        });

        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(baseFilename + ".components"));
        try {
            for (IdCount cnt : counts.values()) {
                String s = translate.backward(cnt.id) + "," + cnt.count + "\n";
                // Explicit charset keeps the output independent of the platform default.
                bos.write(s.getBytes("UTF-8"));
            }
            bos.flush();
        } finally {
            // Always release the file handle, even if translating or writing fails.
            bos.close();
        }
        return counts.values();
    }
}