// BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
// outfile), enc2));
StopWords sw = new StopWords(stopwordfile);
LabelAlphabet dict = new LabelAlphabet();
// word indices of each document (one list per non-empty input line, stop words removed)
ArrayList<TIntArrayList> documentsList= new ArrayList<TIntArrayList>();
String line = null;
while ((line = in.readLine()) != null) {
line = line.trim();
if(line.length()==0)
continue;
String[] toks = line.split("\\s+");
TIntArrayList wordlist = new TIntArrayList();
for(int j=0;j<toks.length;j++){
String tok = toks[j];
if(sw.isStopWord(tok))
continue;
int idx = dict.lookupIndex(tok);
wordlist.add(idx);
}
documentsList.add(wordlist);
}
in.close();
int[][] documents;
documents = new int[documentsList.size()][];
for(int i=0;i<documents.length;i++){
documents[i] = documentsList.get(i).toArray();
}
// vocabulary size (number of distinct non-stop-word tokens seen)
int V = dict.size();
int M = documents.length;
// number of topics
int K = 4;
// good values alpha = 2, beta = .5
float alpha = 2f;
float beta = .5f;
System.out.println("Latent Dirichlet Allocation using Gibbs Sampling.");
LdaGibbsSampler lda = new LdaGibbsSampler(documents, V);
lda.configure(10000, 2000, 100, 10);
lda.gibbs(K, alpha, beta);
float[][] theta = lda.getTheta();
float[][] phi = lda.getPhi();
System.out.println();
System.out.println();
System.out.println("Document--Topic Associations, Theta[d][k] (alpha="
+ alpha + ")");
System.out.print("d\\k\t");
for (int m = 0; m < theta[0].length; m++) {
System.out.print(" " + m % 10 + " ");
}
System.out.println();
for (int m = 0; m < theta.length; m++) {
System.out.print(m + "\t");
for (int k = 0; k < theta[m].length; k++) {
// System.out.print(theta[m][k] + " ");
System.out.print(shadefloat(theta[m][k], 1) + " ");
}
System.out.println();
}
System.out.println();
System.out.println("Topic--Term Associations, Phi[k][w] (beta=" + beta
+ ")");
System.out.print("k\\w\t");
for (int w = 0; w < phi[0].length; w++) {
System.out.print(" " + dict.lookupString(w) + " ");
}
System.out.println();
for (int k = 0; k < phi.length; k++) {
System.out.print(k + "\t");
for (int w = 0; w < phi[k].length; w++) {
System.out.print(lnf.format(phi[k][w]) + " ");
// System.out.print(phi[k][w] + " ");
// System.out.print(shadefloat(phi[k][w], 1) + " ");
}
System.out.println();
}
for (int k = 0; k < phi.length; k++) {
int[] top = MyArrays.sort(phi[k]);
for (int w = 0; w < 10; w++) {
System.out.print(dict.lookupString(top[w]) + " ");
}
System.out.println();
}
}