package edu.gslis.ttg.clusters.clusterers;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;
import cc.twittertools.thrift.gen.TResult;
import edu.gslis.ttg.clusters.Clusters;
import edu.gslis.ttg.jaccard.JaccardStore;
/**
 * Groups a list of results into clusters based on the pairwise Jaccard
 * similarity of their text.
 *
 * <p>Similarity scores for every unordered pair of results are computed up
 * front (on construction and whenever the result list is replaced) and kept in
 * a {@link JaccardStore}; {@link #cluster(double)} can then be called
 * repeatedly with different thresholds without rescoring.
 */
public class SimpleJaccardClusterer {

    /** The results currently being clustered. */
    private List<TResult> results;

    /** Pairwise similarity scores for {@link #results}; rebuilt whenever the results change. */
    private JaccardStore jaccardScores;

    /**
     * Creates a clusterer and immediately scores every pair of results.
     *
     * @param results the results to cluster
     * @throws NullPointerException if {@code results} is {@code null}
     */
    public SimpleJaccardClusterer(List<TResult> results) {
        this.results = results;
        this.jaccardScores = computeJaccardSimilarity();
    }

    /**
     * Builds clusters by merging every pair of documents that the score store
     * returns for the given threshold.
     *
     * @param threshold the similarity cut-off passed to
     *        {@code JaccardStore.getDocsGreaterThanScore}; whether the bound is
     *        inclusive is decided by that method, not here
     * @return a new {@link Clusters} instance containing the merged pairs
     */
    public Clusters cluster(double threshold) {
        Clusters clusters = new Clusters();

        // Map of score -> list of document-id pairs that have that score.
        NavigableMap<Double, List<long[]>> thresholdPairs = jaccardScores.getDocsGreaterThanScore(threshold);

        // values() iterates in the same ascending-key order as walking the key set.
        for (List<long[]> docPairs : thresholdPairs.values()) {
            for (long[] docs : docPairs) {
                // Each entry holds the two document ids of one qualifying pair.
                clusters.mergeMembers(docs[0], docs[1]);
            }
        }

        return clusters;
    }

    /**
     * Returns the results this clusterer was last given.
     *
     * @return the current result list (the same instance that was passed in)
     */
    public List<TResult> getResults() {
        return results;
    }

    /**
     * Replaces the results and recomputes the pairwise similarity scores so
     * that subsequent calls to {@link #cluster(double)} reflect the new list
     * rather than the one supplied at construction time.
     *
     * @param results the new results; {@code null} is accepted and leaves the
     *        clusterer with an empty score store
     */
    public void setResults(List<TResult> results) {
        this.results = results;
        // Previously the scores were left untouched here, so clustering kept
        // using the similarities of the old result list.
        this.jaccardScores = (results == null) ? new JaccardStore() : computeJaccardSimilarity();
    }

    /**
     * Computes the Jaccard similarity of the text of every unordered pair of
     * results and records it under the pair's ids.
     *
     * @return a freshly populated score store for the current results
     */
    private JaccardStore computeJaccardSimilarity() {
        JaccardStore scores = new JaccardStore();
        int count = results.size();

        // Start the inner index after the outer one so each pair is scored once
        // and no result is compared with itself.
        for (int j = 0; j < count; j++) {
            TResult doc1 = results.get(j);
            for (int k = j + 1; k < count; k++) {
                TResult doc2 = results.get(k);
                double jaccardSim = JaccardStore.computeJaccardSimilarity(doc1.getText(), doc2.getText());
                scores.setScore(doc1.getId(), doc2.getId(), jaccardSim);
            }
        }

        return scores;
    }
}