hasRun = false;
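//Reset the run flag before rebuilding the model's data structures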
Set<Document> documentSet = corpus.getDocuments();
//Remove all documents with no topic labels, since they would break LLDA; use an iterator so removal during iteration does not throw a ConcurrentModificationException
for(Iterator<Document> iter = documentSet.iterator(); iter.hasNext();) {
Document document = iter.next();
if(document.getTopics().isEmpty()) {
iter.remove();
}
}
numDocs = documentSet.size();
System.out.println("THREAD "+threadNum+": numDocs = "+numDocs);
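//Allocate per-document arrays sized to the filtered corpus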
documents = new int[numDocs][];
numWordsInDocument = new int[numDocs];
numTopicsInDocument = new int[numDocs];
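//Word/topic-to-ID maps, reverse lookup lists, and per-document label lists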
wordIDs = new HashMap<String,Integer>();
topicIDs = new HashMap<String,Integer>();
idLookup = new ArrayList<String>();
topicLookup = new ArrayList<String>();
docIDLookup = new ArrayList<Document>();
docLabels = new ArrayList<ArrayList<Integer>>();
int docID = 0;
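//Convert each document's topic labels and tokens into integer IDs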
for(Document document : documentSet) {
docIDLookup.add(document);
ArrayList<Integer> labels = new ArrayList<Integer>();
for(String topic : document.getTopics()) {
int topicID;
if(topicIDs.containsKey(topic)) {
topicID = topicIDs.get(topic);
} else {
//Assign the next free ID to this previously unseen topic
topicID = topicIDs.size();
topicIDs.put(topic, topicID);
topicLookup.add(topic);
}
labels.add(topicID);
}
docLabels.add(labels); //labels is stored at index docID because documents are processed in order
numTopicsInDocument[docID] = document.getTopics().size();
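//Tokenize the document text on whitespace and record each token's word ID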
String[] tokens = document.getDocumentString().split("\\s+");
documents[docID] = new int[tokens.length];
numWordsInDocument[docID] = tokens.length;
for(int i=0; i<documents[docID].length; i++) {
//Add the token's ID to the documents array
int wordID;