package cc.mallet.topics;
/** This class implements the value and gradient functions for
 * Dirichlet-multinomial Regression. See Guimaraes and Lindrooth
 * for a general introduction to DMR,
 * and Mimno and McCallum (UAI, 2008) for an application to
 * multinomial mixture models.
 */
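/* Model summary: each training instance pairs a document's feature vector x_d
 * (the instance data) with its observed label/topic counts n_d (the target).
 * The parameters lambda_t optimized here define document-specific Dirichlet
 * hyperparameters
 *   alpha_{d,t} = exp(lambda_t . x_d + lambda_{t,default}).
 * getValue() returns the Dirichlet-multinomial log likelihood of the counts
 * plus a Gaussian log prior on the parameters, and getValueGradient() returns
 * the corresponding gradient.
 */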
import cc.mallet.optimize.Optimizable;
import cc.mallet.classify.MaxEnt;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Instance;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Dirichlet;
import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import java.util.logging.*;
import java.util.*;
import java.text.NumberFormat;
import java.text.DecimalFormat;
import gnu.trove.TIntIntHashMap;
public class DMROptimizable implements Optimizable.ByGradientValue {
private static Logger logger = MalletLogger.getLogger(DMROptimizable.class.getName());
private static Logger progressLogger = MalletProgressMessageLogger.getLogger(DMROptimizable.class.getName()+"-pl");
MaxEnt classifier;
InstanceList trainingList;
int numGetValueCalls = 0;
int numGetValueGradientCalls = 0;
int numIterations = Integer.MAX_VALUE;
NumberFormat formatter = null;
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1;
static final double DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE = 100;
static final double DEFAULT_GAUSSIAN_PRIOR_MEAN = 0.0;
double gaussianPriorMean = DEFAULT_GAUSSIAN_PRIOR_MEAN;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
// Allowing the default feature (the base level) to
// fluctuate more freely than the feature parameters leads
// to much better results.
double defaultFeatureGaussianPriorVariance = DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE;
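// Parameters are stored in a single flat array, indexed as
//  parameters[label * numFeatures + feature]; the last column
//  (defaultFeatureIndex) holds the per-label intercept term.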
double[] parameters;
double[] cachedGradient;
double cachedValue;
boolean cachedValueStale;
boolean cachedGradientStale;
int numLabels;
int numFeatures;
int defaultFeatureIndex;
public DMROptimizable () {}
public DMROptimizable (InstanceList instances, MaxEnt initialClassifier) {
this.trainingList = instances;
Alphabet alphabet = instances.getDataAlphabet();
Alphabet labelAlphabet = instances.getTargetAlphabet();
this.numLabels = labelAlphabet.size();
// Add one feature for the "default feature".
this.numFeatures = alphabet.size() + 1; // add a spot for the intercept term
//System.out.println("num features: " + numFeatures + " numLabels: " + numLabels);
this.defaultFeatureIndex = numFeatures - 1;
this.parameters = new double [numLabels * numFeatures];
//this.constraints = new double [numLabels * numFeatures];
this.cachedGradient = new double [numLabels * numFeatures];
if (initialClassifier != null) {
this.classifier = initialClassifier;
this.parameters = classifier.getParameters();
this.defaultFeatureIndex = classifier.getDefaultFeatureIndex();
assert (initialClassifier.getInstancePipe() == instances.getPipe());
}
else if (this.classifier == null) {
this.classifier =
new MaxEnt (instances.getPipe(), parameters);
}
formatter = new DecimalFormat("0.###E0");
cachedValueStale = true;
cachedGradientStale = true;
// Sanity-check the training data for NaN feature values
logger.fine("Number of instances in training list = " + trainingList.size());
for (Instance instance : trainingList) {
FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
if (multinomialValues == null)
continue;
FeatureVector features = (FeatureVector) instance.getData();
assert (features.getAlphabet() == alphabet);
boolean hasNaN = false;
for (int i = 0; i < features.numLocations(); i++) {
if (Double.isNaN(features.valueAtLocation(i))) {
logger.info("NaN for feature " + alphabet.lookupObject(features.indexAtLocation(i)).toString());
hasNaN = true;
}
}
if (hasNaN) {
logger.info("NaN in instance: " + instance.getName());
}
}
//TestMaximizable.testValueAndGradientCurrentParameters (this);
}
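/* Illustrative usage sketch (not part of the original class): this Optimizable
 * is typically driven by a gradient-based optimizer such as MALLET's
 * LimitedMemoryBFGS, which repeatedly calls getValue() and getValueGradient().
 * Variable names and prior variances below are hypothetical.
 *
 *   DMROptimizable optimizable = new DMROptimizable(trainingInstances, null);
 *   optimizable.setRegularGaussianPriorVariance(0.5);
 *   optimizable.setInterceptGaussianPriorVariance(100.0);
 *   Optimizer optimizer = new LimitedMemoryBFGS(optimizable);
 *   try {
 *       optimizer.optimize();
 *   } catch (OptimizationException e) {
 *       // The optimizer may throw if the line search cannot make progress;
 *       // the parameters found so far remain in the classifier.
 *   }
 *   MaxEnt dmrParameters = optimizable.getClassifier();
 */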
/** Set the variance for the default features (aka intercept terms), generally
* larger than the variance for the regular features.
*/
public void setInterceptGaussianPriorVariance(double sigmaSquared) {
this.defaultFeatureGaussianPriorVariance = sigmaSquared;
}
/** Set the variance for regular (non default) features, generally
* smaller than the variance for the default features.
*/
public void setRegularGaussianPriorVariance(double sigmaSquared) {
this.gaussianPriorVariance = sigmaSquared;
}
public MaxEnt getClassifier () { return classifier; }
public double getParameter (int index) {
return parameters[index];
}
public void setParameter (int index, double v) {
cachedValueStale = true;
cachedGradientStale = true;
parameters[index] = v;
}
public int getNumParameters() {
return parameters.length;
}
public void getParameters (double[] buff) {
// The Optimizable contract expects the caller to supply a buffer of length
//  getNumParameters(); reallocating a local array here would be invisible
//  to the caller, so require a correctly sized buffer instead.
assert (buff != null && buff.length == parameters.length);
System.arraycopy (parameters, 0, buff, 0, parameters.length);
}
public void setParameters (double [] buff) {
assert (buff != null);
cachedValueStale = true;
cachedGradientStale = true;
if (buff.length != parameters.length)
parameters = new double[buff.length];
System.arraycopy (buff, 0, parameters, 0, buff.length);
}
/** The log probability of the observed count vectors given the features,
 * plus the log of the Gaussian prior on the parameters. */
public double getValue () {
if (! cachedValueStale) { return cachedValue; }
numGetValueCalls++;
cachedValue = 0;
// Incorporate likelihood of data
double[] scores = new double[ trainingList.getTargetAlphabet().size() ];
double value = 0.0;
int instanceIndex = 0;
for (Instance instance: trainingList) {
FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
if (multinomialValues == null) { continue; }
// Reset the per-document contribution so that the error checks and the
//  update to cachedValue below refer to this instance alone.
value = 0.0;
//System.out.println("L Now "+inputAlphabet.size()+" regular features.");
// Get the unnormalized (log-space) score lambda_t . x_d for each label
// under the current model parameters
this.classifier.getUnnormalizedClassificationScores(instance, scores);
double sumScores = 0.0;
// Exponentiate the scores
for (int i=0; i<scores.length; i++) {
// Due to underflow, it's very likely that some of these scores will be 0.0.
scores[i] = Math.exp(scores[i]);
sumScores += scores[i];
}
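// The exponentiated scores are this document's Dirichlet parameters,
//  alpha_t = exp(lambda_t . x_d). The lines below accumulate the
//  Dirichlet-multinomial log likelihood of the observed counts
//  (up to an additive constant that does not depend on the parameters):
//    sum_t [ logGamma(alpha_t + n_t) - logGamma(alpha_t) ]
//      - [ logGamma(sum_t alpha_t + n) - logGamma(sum_t alpha_t) ]
//  where n_t is the count for label t and n is the document's total count.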
FeatureVector features = (FeatureVector) instance.getData();
// This is really an int, but since FeatureVectors are defined as doubles,
// avoid casting.
double totalLength = 0;
for (int i = 0; i < multinomialValues.numLocations(); i++) {
int label = multinomialValues.indexAtLocation(i);
double count = multinomialValues.valueAtLocation(i);
value += (Dirichlet.logGammaStirling(scores[label] + count) -
Dirichlet.logGammaStirling(scores[label]));
totalLength += count;
}
value -= (Dirichlet.logGammaStirling(sumScores + totalLength) -
Dirichlet.logGammaStirling(sumScores));
// Error Checking:
if (Double.isNaN(value)) {
logger.fine ("DCMMaxEntTrainer: Instance " + instance.getName() +
"has NaN value.");
for (int label: multinomialValues.getIndices()) {
logger.fine ("log(scores)= " + Math.log(scores[label]) +
" scores = " + scores[label]);
}
}
if (Double.isInfinite(value)) {
logger.warning ("Instance " + instance.getSource() +
" has infinite value; skipping value and gradient");
cachedValue -= value;
cachedValueStale = false;
return -value;
}
//System.out.println(value);
cachedValue += value;
instanceIndex++;
}
// Incorporate prior on parameters
double prior = 0;
// The log of a Gaussian prior, dropping constant terms, is -(x - mu)^2 / (2 sigma^2)
for (int label = 0; label < numLabels; label++) {
for (int feature = 0; feature < numFeatures - 1; feature++) {
double param = parameters[label*numFeatures + feature];
prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) / (2 * gaussianPriorVariance);
}
double param = parameters[label*numFeatures + defaultFeatureIndex];
prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) /
(2 * defaultFeatureGaussianPriorVariance);
}
double labelProbability = cachedValue;
cachedValue += prior;
cachedValueStale = false;
progressLogger.info ("Value (likelihood=" + formatter.format(labelProbability) +
" prior=" + formatter.format(prior) +
") = " + formatter.format(cachedValue));
return cachedValue;
}
public void getValueGradient (double [] buffer) {
MatrixOps.setAll (cachedGradient, 0.0);
// Incorporate likelihood of data
double[] scores = new double[ trainingList.getTargetAlphabet().size() ];
int instanceIndex = 0;
for (Instance instance: trainingList) {
FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
if (multinomialValues == null) { continue; }
// Get the unnormalized (log-space) score lambda_t . x_d for each label
// under the current model parameters
this.classifier.getUnnormalizedClassificationScores(instance, scores);
double sumScores = 0.0;
// Exponentiate the scores
for (int i=0; i<scores.length; i++) {
// Due to underflow, it's very likely that some of these scores will be 0.0.
scores[i] = Math.exp(scores[i]);
sumScores += scores[i];
}
FeatureVector features = (FeatureVector) instance.getData();
double totalLength = 0;
for (double count : multinomialValues.getValues()) {
totalLength += count;
}
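// The gradient of the per-document likelihood with respect to lambda_{t,f} is
//   x_{d,f} * alpha_t * [ digamma(alpha_t + n_t) - digamma(alpha_t)
//                          - digamma(sum alpha + n) + digamma(sum alpha) ]
// by the chain rule, since d alpha_t / d lambda_{t,f} = alpha_t * x_{d,f}.
// The digamma difference involving the sums is shared by all labels,
// so compute it once per document.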
double digammaDifferenceForSums =
Dirichlet.digamma(sumScores + totalLength) -
Dirichlet.digamma(sumScores);
for (int loc = 0; loc < features.numLocations(); loc++) {
int index = features.indexAtLocation(loc);
double value = features.valueAtLocation(loc);
if (value == 0.0) { continue; }
// In a FeatureVector, there's no easy way to say "do you know
// about this id?" so I've broken this into two for loops,
// one for all labels, the other for just the non-zero ones.
for (int label=0; label<numLabels; label++) {
cachedGradient[label * numFeatures + index] -=
value * scores[label] * digammaDifferenceForSums;
}
for (int labelLoc = 0; labelLoc <multinomialValues.numLocations(); labelLoc++) {
int label = multinomialValues.indexAtLocation(labelLoc);
double count = multinomialValues.valueAtLocation(labelLoc);
double diff = 0.0;
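// For small integer counts, use the exact identity
//  digamma(x + n) - digamma(x) = sum_{i=0}^{n-1} 1 / (x + i),
// which is cheaper than two digamma evaluations.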
if (count < 20) {
for (int i=0; i < count; i++) {
diff += 1 / (scores[label] + i);
}
}
else {
diff = Dirichlet.digamma(scores[label] + count) -
Dirichlet.digamma(scores[label]);
}
cachedGradient[label * numFeatures + index] +=
value * scores[label] * diff;
}
}
// Now add the default feature
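// (the default feature implicitly has value 1.0 for every instance,
//  so the feature-value factor drops out of these gradient terms)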
for (int label=0; label<numLabels; label++) {
cachedGradient[label * numFeatures + defaultFeatureIndex] -=
scores[label] * digammaDifferenceForSums;
}
for(int labelLoc = 0; labelLoc <multinomialValues.numLocations(); labelLoc++) {
int label = multinomialValues.indexAtLocation(labelLoc);
double count = multinomialValues.valueAtLocation(labelLoc);
double diff = 0.0;
if (count < 20) {
for (int i=0; i < count; i++) {
diff += 1 / (scores[label] + i);
}
}
else {
diff = Dirichlet.digamma(scores[label] + count) -
Dirichlet.digamma(scores[label]);
}
cachedGradient[label * numFeatures + defaultFeatureIndex] +=
scores[label] * diff;
}
}
numGetValueGradientCalls++;
for (int label = 0; label < numLabels; label++) {
for (int feature = 0; feature < numFeatures - 1; feature++) {
double param = parameters[label*numFeatures + feature];
cachedGradient[label * numFeatures + feature] -=
(param - gaussianPriorMean) / gaussianPriorVariance;
}
double param = parameters[label*numFeatures + defaultFeatureIndex];
cachedGradient[label * numFeatures + defaultFeatureIndex] -=
(param - gaussianPriorMean) / defaultFeatureGaussianPriorVariance;
}
// A parameter may be set to -infinity by an external user.
// We set gradient to 0 because the parameter's value can
// never change anyway and it will mess up future calculations
// on the matrix, such as norm().
MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
assert (buffer != null && buffer.length == parameters.length);
System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
//System.out.println ("DCMMaxEntTrainer gradient infinity norm = "+MatrixOps.infinityNorm(cachedGradient));
}
}