// Load the labelled corpus, picking the stemmed or unstemmed variant.
Corpus corpus = Corpus.loadLabelled(topicType, stem ? "allprofiles-stemmed" : "allprofiles-unstemmed");
// Reuse a previously trained LLDA model if one exists on disk;
// otherwise train a fresh one and persist it for next time.
LLDATopicModel llda;
File modelFile = new File("models/llda/" + topicType + "/" + description + ".model");
if (modelFile.exists()) {
    System.out.println("Found LLDA model " + description);
    llda = LLDATopicModel.load(topicType, description);
} else {
    System.out.println("Couldn't find LLDA model " + description + ", creating new one");
    // NOTE(review): 1 and 0.01 are presumably sampler hyperparameters (e.g. alpha/beta)
    // — confirm against the LLDATopicModel constructor.
    llda = new LLDATopicModel(corpus, burn, sample, lag, 1, 0.01);
    llda.runGibbsSampling();
    llda.save(description);
}
try {
    // Persist each document's topic distribution as one CSV per user:
    // header row, then one <topic,probability> row per topic.
    List<List<WordScore>> docTopics = llda.getDocuments();
    int docID = 0;
    for (List<WordScore> document : docTopics) {
        // Map the dense document index back to the owning user's ID for the file name.
        Long userID = llda.getDocIDFromIndex(docID);
        FileOutputStream fileOut = new FileOutputStream(dirName + "/" + userID + ".csv");
        PrintWriter writeOut = new PrintWriter(fileOut);
        try {
            writeOut.println("\"topic\",\"probability\"");
            for (WordScore topic : document) {
                writeOut.println(topic.getWord() + "," + topic.getScore());
            }
        } finally {
            // Always release the file handle: previously an exception thrown while
            // writing leaked the stream, since close() was only reached on success.
            writeOut.close();
        }
        docID++;
    }
//NOTE: We are saving these for now. However, we always have a saved model
//and we can get these attributes from the model.
//The topic-word distributions, topic ID list and vocabulary are serialized
//here so downstream consumers can read them without reloading the model.
Map<String,Integer> vocab = llda.getVocab(); // presumably maps word -> vocabulary index; confirm in LLDATopicModel
double[][] topics = llda.getTopicsUnsorted(); // NOTE(review): assumed one row per topic over the vocabulary — confirm
ArrayList<String> topicIDs = llda.getTopicsIDList(); // NOTE(review): assumed aligned with the rows of `topics` — confirm
//Save topics via Java object serialization to <dirName>/TOPICS.obj
FileOutputStream topicsFileOut = new FileOutputStream(dirName+"/TOPICS.obj");
ObjectOutputStream topicsObjectOut = new ObjectOutputStream(topicsFileOut);
topicsObjectOut.writeObject(topics);
// NOTE(review): topicsObjectOut is not closed within this chunk — verify it is
// flushed and closed after this point, otherwise TOPICS.obj may be truncated.