double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY));
Path dir = new Path(statePath);
// TODO scalability bottleneck: the dense matrix needs numWords * numTopics * 8 bytes, in the driver *and* in the M/R classes
DenseMatrix pWgT = new DenseMatrix(numTopics, numWords);
double[] logTotals = new double[numTopics];
Arrays.fill(logTotals, Double.NEGATIVE_INFINITY);
double ll = 0.0;
if (empty) {
return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}
for (Pair<IntPairWritable,DoubleWritable> record
: new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(new Path(dir, "part-*"),
PathType.GLOB,
null,
null,
true,
job)) {
IntPairWritable key = record.getFirst();
DoubleWritable value = record.getSecond();
int topic = key.getFirst();
int word = key.getSecond();
if (word == TOPIC_SUM_KEY) {
logTotals[topic] = value.get();
Preconditions.checkArgument(!Double.isInfinite(value.get()));
} else if (topic == LOG_LIKELIHOOD_KEY) {
ll = value.get();
} else {
Preconditions.checkArgument(topic >= 0, "topic should be non-negative, not %d", topic);
Preconditions.checkArgument(word >= 0, "word should be non-negative not %d", word);
Preconditions.checkArgument(pWgT.getQuick(topic, word) == 0.0);
pWgT.setQuick(topic, word, value.get());
Preconditions.checkArgument(!Double.isInfinite(pWgT.getQuick(topic, word)));
}
}
return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll);
}