package syntaxLearner;
import java.util.Set;
import java.util.TreeSet;
import syntaxLearner.corpus.Vocabulary;
import syntaxLearner.corpus.Word;
public class Cluster {

	/** Identifier assigned by the owning Learner; -1 is reserved for the ground cluster. */
	public final short ID;
	public int totalSize; // summed corpus frequency of this cluster's words (size of this portion of the corpus)
	public int wordCount; // number of distinct words currently in this cluster
	public Set<Integer> words;
	private double[][] distribution; // cached centroid; valid only while learner reports this cluster updated
	private final Vocabulary vocab;
	private final Learner learner;
	private boolean isNew = false;

	/** Creates a regular (non-ground) cluster, consuming a fresh ID from the learner. */
	public Cluster(Vocabulary v, Learner l) {
		this(v, l, false);
	}

	/**
	 * Creates a cluster. When {@code isGround} is true, the reserved ID -1 is used
	 * instead of consuming a new ID from the learner.
	 */
	public Cluster(Vocabulary v, Learner l, boolean isGround) {
		this.vocab = v;
		this.learner = l;
		this.ID = isGround ? -1 : learner.newClusterID();
		words = new TreeSet<Integer>();
		reset();
	}

	/**
	 * Calculates the distribution of an entire Cluster.
	 * Similar to calculating a centroid with k-means.
	 * The result is cached; it is recomputed only when the learner reports
	 * this cluster as not up to date.
	 * @return a (NUMBER_OF_CLUSTERS+1) x (NUMBER_OF_CLUSTERS+1) matrix indexed
	 *         by shifted context types
	 */
	public double[][] clusterDistribution() {
		if (learner.isClusterUpdated(this)) {
			return distribution;
		}
		/* Initialize */
		distribution = new double[learner.NUMBER_OF_CLUSTERS + 1][learner.NUMBER_OF_CLUSTERS + 1];
		Set<ClusterContext> clusterContexts = learner.getClusterContexts();
		/* Sum every context in every word, weighted by word frequency. */
		for (int i : words) {
			Word w = vocab.getWord(i);
			// Promote to double BEFORE multiplying: totalSize * wordCount as an
			// int product could overflow on large corpora.
			double weight = w.frequency / ((double) totalSize * wordCount);
			for (ClusterContext cc : clusterContexts) {
				// +1 shift — presumably maps the ground type (-1) to index 0; confirm against ClusterContext
				distribution[cc.type1 + 1][cc.type2 + 1] += w.clusterDistribution(cc) * weight;
			}
		}
		// BUG FIX: this was previously called inside the loop (once per word,
		// and never for an empty cluster). Register the recompute exactly once.
		learner.registerClusterUpdate(this);
		return distribution;
	}

	/**
	 * Merges another cluster's words into this one and sets parenthood.
	 * Assumes the intersection of the two word sets is empty (counts are
	 * summed, not recomputed).
	 */
	public void add(Cluster c) {
		for (int i : c.words) {
			vocab.getWord(i).setParent(this);
			learner.setParent(i, this.ID);
		}
		words.addAll(c.words);
		totalSize += c.totalSize;
		wordCount += c.wordCount;
	}

	/** Adds a single word: updates counts and sets parenthood. */
	public void add(int i) {
		words.add(i);
		wordCount++;
		Word w = vocab.getWord(i);
		totalSize += w.frequency;
		w.setParent(this);
	}

	/** Removes a word and rolls back its contribution to the counts; no-op if absent. */
	public void remove(int i) {
		// Set.remove reports presence, avoiding a separate contains() lookup.
		if (words.remove(i)) {
			wordCount--;
			totalSize -= vocab.getWord(i).frequency;
			assert (totalSize >= 0 && wordCount >= 0);
		}
	}

	/**
	 * Wipes all data apart from the parent vocabulary.
	 * Done this way instead of destructing the object
	 * altogether in order to keep a constant number of
	 * clusters in the count.
	 */
	public void reset() {
		words.clear();
		wordCount = 0;
		totalSize = 0;
	}

	public short getID() {
		return ID;
	}

	public boolean isNew() {
		return isNew;
	}

	public void setNew(boolean b) {
		isNew = b;
	}
}