/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify;
import java.util.logging.*;
import java.util.*;
import java.io.*;
import cc.mallet.classify.Classifier;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;
// Does not currently handle instances that are labeled with distributions
// instead of a single label.
/**
 * The trainer for a multi-conditional Maximum Entropy classifier ({@code MCMaxEnt}).
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
// Declares the trainer: it extends ClassifierTrainer<MCMaxEnt> and is marked Boostable and Serializable.
public class MCMaxEntTrainer extends ClassifierTrainer<MCMaxEnt> implements Boostable, Serializable //implements CommandOption.ListProviding
// General-purpose logger, plus a second logger (name suffixed "-pl") used for progress messages.
private static Logger logger = MalletLogger.getLogger(MCMaxEntTrainer.class.getName());
private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MCMaxEntTrainer.class.getName()+"-pl");
// Counters reported by getValueCalls() and getValueGradientCalls().
// NOTE(review): no statement in the visible source increments either counter -- confirm.
int numGetValueCalls = 0;
int numGetValueGradientCalls = 0;
// Iteration limit recorded by setNumIterations(); train(InstanceList) does not read it.
int numIterations = 10;
// Accepted values for the gainName argument of trainWithFeatureInduction.
public static final String EXP_GAIN = "exp";
public static final String GRADIENT_GAIN = "grad";
public static final String INFORMATION_GAIN = "info";
// xxx Why does TestMaximizable fail when this variance is very small?
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = .1; // note used to be 1
// Defaults for the slope and sharpness of the hyperbolic prior.
static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;
// Objective and prior settings; read by the inner MaximizableTrainer when it computes value and gradient.
boolean usingMultiConditionalTraining = true;
boolean usingHyperbolicPrior = false;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
// NOTE(review): train(InstanceList) instantiates LimitedMemoryBFGS directly and never reads maximizerClass.
Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;
// NOTE(review): not read anywhere in the visible source.
double generativeWeighting = 1.0;
// Optimizable built by the most recent train(InstanceList) call; null until then.
MaximizableTrainer mt;
// Optional starting classifier; assigned only by the MCMaxEntTrainer(MCMaxEnt) constructor.
MCMaxEnt initialClassifier;
// Command-line options for this trainer. The MCMaxEntTrainer(CommandOption.List) constructor
// copies each option's current value into the matching instance field.
static CommandOption.Boolean usingMultiConditionalTrainingOption =
new CommandOption.Boolean (MCMaxEntTrainer.class, "useMCTraining", "true|false", true, true,
"Use MultiConditional Training", null);
static CommandOption.Boolean usingHyperbolicPriorOption =
new CommandOption.Boolean (MCMaxEntTrainer.class, "useHyperbolicPrior", "true|false", false, false,
"Use hyperbolic (close to L1 penalty) prior over parameters", null);
// NOTE(review): the numeric argument here is 10.0 (presumably the option's default), which
// differs from DEFAULT_GAUSSIAN_PRIOR_VARIANCE (.1) -- confirm which one is intended.
static CommandOption.Double gaussianPriorVarianceOption =
new CommandOption.Double (MCMaxEntTrainer.class, "gaussianPriorVariance", "FLOAT", true, 10.0,
"Variance of the gaussian prior over parameters", null);
static CommandOption.Double hyperbolicPriorSlopeOption =
new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSlope", "FLOAT", true, 0.2,
"Slope of the (L1 penalty) hyperbolic prior over parameters", null);
static CommandOption.Double hyperbolicPriorSharpnessOption =
new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSharpness", "FLOAT", true, 10.0,
"Sharpness of the (L1 penalty) hyperbolic prior over parameters", null);
// Groups the options under the heading "MCMaximum Entropy Classifier"; returned by getCommandOptionList().
// NOTE(review): only the first array element is visible in this copy; the initializer looks truncated.
static final CommandOption.List commandOptions =
new CommandOption.List (
"MCMaximum Entropy Classifier",
new CommandOption[] {
usingMultiConditionalTrainingOption, // CPAL
public static CommandOption.List getCommandOptionList ()
return commandOptions;
// NOTE(review): neither the type Maximizer nor a field named maximizerByGradient is declared
// or imported anywhere in the visible source, so this constructor looks like leftover
// (possibly once commented-out) code -- confirm whether it is meant to be live.
public MCMaxEntTrainer(Maximizer.ByGradient maximizer)
// Stores the supplied maximizer and switches the hyperbolic prior off.
this.maximizerByGradient = maximizer;
this.usingHyperbolicPrior = false;
/**
 * Builds a trainer configured from the current values of this class's static
 * command options. The {@code col} argument itself is not consulted.
 */
public MCMaxEntTrainer (CommandOption.List col)
{
  // Training objective
  this.usingMultiConditionalTraining = usingMultiConditionalTrainingOption.value;
  // Prior selection and its parameters
  this.usingHyperbolicPrior = usingHyperbolicPriorOption.value;
  this.hyperbolicPriorSlope = hyperbolicPriorSlopeOption.value;
  this.hyperbolicPriorSharpness = hyperbolicPriorSharpnessOption.value;
  this.gaussianPriorVariance = gaussianPriorVarianceOption.value;
}
/**
 * Creates a trainer that hands the given classifier to the optimizable built by
 * {@code train}, so training starts from that classifier's parameters.
 */
public MCMaxEntTrainer (MCMaxEnt initialClassifier)
{
  this.initialClassifier = initialClassifier;
}
/** Creates a trainer with the hyperbolic prior switched off and all other settings at their defaults. */
public MCMaxEntTrainer ()
{
  this (false);
}
/**
 * Creates a trainer and chooses which prior it uses.
 * @param useHyperbolicPrior true for the hyperbolic prior, false for the Gaussian prior
 */
public MCMaxEntTrainer (boolean useHyperbolicPrior)
{
  this.usingHyperbolicPrior = useHyperbolicPrior;
}
/**
 * Constructs a trainer that uses a Gaussian prior with the given variance,
 * a parameter to avoid overtraining. 1.0 is usually a reasonable default value.
 */
public MCMaxEntTrainer (double gaussianPriorVariance)
{
  this.gaussianPriorVariance = gaussianPriorVariance;
  this.usingHyperbolicPrior = false;
}
/**
 * CPAL - constructs a Gaussian-prior trainer and selects whether the
 * multi-conditional training objective is used.
 */
public MCMaxEntTrainer (double gaussianPriorVariance, boolean useMultiConditionalTraining )
{
  this.gaussianPriorVariance = gaussianPriorVariance;
  this.usingMultiConditionalTraining = useMultiConditionalTraining;
  this.usingHyperbolicPrior = false;
}
/**
 * Constructs a trainer that uses the hyperbolic prior with the given
 * slope and sharpness.
 */
public MCMaxEntTrainer (double hyperbolicPriorSlope, double hyperbolicPriorSharpness)
{
  this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
  this.hyperbolicPriorSlope = hyperbolicPriorSlope;
  this.usingHyperbolicPrior = true;
}
public Optimizable.ByGradientValue getMaximizableTrainer (InstanceList ilist)
if (ilist == null)
return new MaximizableTrainer ();
return new MaximizableTrainer (ilist, null);
* Specifies the maximum number of iterations to run during a single call
* to <code>train</code> or <code>trainWithFeatureInduction</code>. Not
* currently functional.
* @return This trainer
// XXX Since we maximize before using numIterations, this doesn't work.
// Is that a bug? If so, should the default numIterations be higher?
public MCMaxEntTrainer setNumIterations (int i)
numIterations = i;
return this;
public MCMaxEntTrainer setUseHyperbolicPrior (boolean useHyperbolicPrior)
this.usingHyperbolicPrior = useHyperbolicPrior;
return this;
* Sets a parameter to prevent overtraining. A smaller variance for the prior
* means that feature weights are expected to hover closer to 0, so extra
* evidence is required to set a higher weight.
* @return This trainer
public MCMaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance)
this.usingHyperbolicPrior = false;
this.gaussianPriorVariance = gaussianPriorVariance;
return this;
public MCMaxEntTrainer setHyperbolicPriorSlope(double hyperbolicPriorSlope)
this.usingHyperbolicPrior = true;
this.hyperbolicPriorSlope = hyperbolicPriorSlope;
return this;
public MCMaxEntTrainer setHyperbolicPriorSharpness (double hyperbolicPriorSharpness)
this.usingHyperbolicPrior = true;
this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
return this;
public MCMaxEnt getClassifier () {
return mt.getClassifier();
public MCMaxEnt train (InstanceList trainingSet)
logger.fine ("trainingSet.size() = "+trainingSet.size());
mt = new MaximizableTrainer (trainingSet, (MCMaxEnt)initialClassifier);
Optimizer maximizer = new LimitedMemoryBFGS(mt);
// CPAL - change the tolerance for large vocab experiments
((LimitedMemoryBFGS)maximizer).setTolerance(.00001); // std is .0001;
maximizer.optimize (); // XXX given the loop below, this seems wrong.
logger.info("MCMaxEnt ngetValueCalls:"+getValueCalls()+"\nMCMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
// boolean converged;
// for (int i = 0; i < numIterations; i++) {
// converged = maximizer.maximize (mt, 1);
// if (converged)
// break;
// else if (evaluator != null)
// if (!evaluator.evaluate (mt.getClassifier(), converged, i, mt.getValue(),
// trainingSet, validationSet, testSet))
// break;
// }
// TestMaximizable.testValueAndGradient (mt);
progressLogger.info("\n"); // progess messages are on one line; move on.
return mt.getClassifier ();
/**
 * <p>Like the other version of <code>trainWithFeatureInduction</code>, but
 * allows some default options to be changed.</p>
 *
 * @param maxent An initial partially-trained classifier (default <code>null</code>).
 * This classifier may be modified during training.
 * @param gainName The estimate of gain (log-likelihood increase) we want our chosen
 * features to maximize.
 * Should be one of <code>MCMaxEntTrainer.EXP_GAIN</code>,
 * <code>MCMaxEntTrainer.GRADIENT_GAIN</code>, or
 * <code>MCMaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
 * @return The trained <code>MCMaxEnt</code> classifier
 */
// Interleaves feature induction with training: each round after the first trains on the
// currently selected features, the misclassified training instances are collected, and a
// FeatureInducer is asked to add features; one final training call follows the last round.
// NOTE(review): several statements below are cut off in this copy of the file (argument
// lists ending in a trailing comma, a throw with no visible else, missing closing braces).
// The comments describe only what is visible.
public Classifier trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData,
InstanceList testingData,
ClassifierEvaluating evaluator,
MCMaxEnt maxent,
int totalIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
String gainName) {
// XXX This ought to be a parameter, except that setting it to true can
// crash training ("Jump too small").
boolean saveParametersDuringFI = false;
Alphabet inputAlphabet = trainingData.getDataAlphabet();
Alphabet outputAlphabet = trainingData.getTargetAlphabet();
// With no starting classifier, begin from an all-zero weight array holding one slot per
// (label, feature) pair plus one extra slot per label.
if (maxent == null)
maxent = new MCMaxEnt(trainingData.getPipe(),
new double[(1+inputAlphabet.size()) * outputAlphabet.size()]);
// Running total of training iterations spent; subtracted from totalIterations before the final train call.
int trainingIteration = 0;
int numLabels = outputAlphabet.size();
// Initialize feature selection
FeatureSelection globalFS = trainingData.getFeatureSelection();
if (globalFS == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
globalFS = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (globalFS);
// Share the same selection object with the validation and testing lists, when supplied.
if (validationData != null) validationData.setFeatureSelection (globalFS);
if (testingData != null) testingData.setFeatureSelection (globalFS);
// Rebuild the classifier around globalFS, keeping its pipe and parameters.
maxent = new MCMaxEnt(maxent.getInstancePipe(), maxent.getParameters(), globalFS);
// Run feature induction
for (int featureInductionIteration = 0;
featureInductionIteration < numFeatureInductions;
featureInductionIteration++) {
// Print out some feature information
logger.info ("Feature induction iteration "+featureInductionIteration);
// Train the model a little bit. We don't care whether it converges; we
// execute all feature induction iterations no matter what.
if (featureInductionIteration != 0) {
// Don't train until we have added some features
// NOTE(review): this call is truncated here, and no train overload taking these
// arguments is defined in the visible source -- confirm against the superclass.
maxent = (MCMaxEnt)this.train (trainingData, validationData, testingData, evaluator,
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
" features over "+numLabels+" labels.");
// Create the list of error tokens
// NOTE(review): the constructor call below is truncated in this copy.
InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (globalFS);
// NOTE(review): nothing visible ever adds to errorLabelVectors, so the lvs array built
// below would be empty -- confirm whether an add(...) call was lost.
List errorLabelVectors = new ArrayList(); // these are length-1 vectors
for (int i = 0; i < trainingData.size(); i++) {
Instance instance = trainingData.get(i);
FeatureVector inputVector = (FeatureVector) instance.getData();
Label trueLabel = (Label) instance.getTarget();
// Having trained using just the current features, see how we classify
// the training data now.
Classification classification = maxent.classify(instance);
// Misclassified instances are copied into errorInstances with their true label.
if (!classification.bestLabelIsCorrect()) {
errorInstances.add(inputVector, trueLabel, null, null);
logger.info ("Error instance list size = "+errorInstances.size());
// Copy the raw list into a typed array for the gain factories.
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++) {
lvs[i] = (LabelVector)errorLabelVectors.get(i);
// Choose the gain estimator named by gainName (EXP_GAIN, GRADIENT_GAIN or INFORMATION_GAIN).
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals (EXP_GAIN))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals(GRADIENT_GAIN))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals(INFORMATION_GAIN))
gainFactory = new InfoGain.Factory ();
// NOTE(review): no else is visible before this throw; as written it would be unconditional.
throw new IllegalArgumentException("Unsupported gain name: "+gainName);
// NOTE(review): the FeatureInducer constructor call is truncated in this copy.
FeatureInducer klfi =
new FeatureInducer (gainFactory,
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
logger.info ("MCMaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
klfi = null;
// Fresh zeroed parameter array sized for the current input alphabet.
double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];
// XXX (Executing this block often causes an error during training; I don't know why.)
if (saveParametersDuringFI) {
// Keep current parameter values
// XXX This relies on the implementation detail that the most recent features
// added to an Alphabet get the highest indices.
// Count parameters per output label
int oldParamCount = maxent.parameters.length / outputAlphabet.size();
int newParamCount = 1+inputAlphabet.size();
// Copy params into the proper locations
for (int i=0; i<outputAlphabet.size(); i++) {
// NOTE(review): the length argument of this arraycopy is missing in this copy.
System.arraycopy(maxent.parameters, i*oldParamCount,
newParameters, i*newParamCount,
// Debug output: prints any leading entries that differ between the old and new arrays.
for (int i=0; i<oldParamCount; i++)
if (maxent.parameters[i] != newParameters[i]) {
System.out.println(maxent.parameters[i]+" "+newParameters[i]);
// Install the new array and place the default feature index after the last regular feature.
maxent.parameters = newParameters;
maxent.defaultFeatureIndex = inputAlphabet.size();
// Finished feature induction
logger.info("Ended with "+globalFS.cardinality()+" features.");
// Whatever iteration budget remains is recorded before the final training call.
setNumIterations(totalIterations - trainingIteration);
return this.train (trainingData, validationData, testingData,
evaluator, maxent);
// XXX Should these really be public? Why?
/**
 * Reports the counter of how many times this trainer has computed the
 * gradient of the log probability of training labels.
 */
public int getValueGradientCalls()
{
  return numGetValueGradientCalls;
}
/**
 * Reports the counter of how many times this trainer has computed the
 * log probability of training labels.
 */
public int getValueCalls()
{
  return numGetValueCalls;
}
// public int getIterations() {return maximizerByGradient.getIterations();}
public String toString()
return "MCMaxEntTrainer"
// + "("+maximizerClass.getName()+") "
+ ",numIterations=" + numIterations
+ (usingHyperbolicPrior
? (",hyperbolicPriorSlope="+hyperbolicPriorSlope+
: (",gaussianPriorVariance="+gaussianPriorVariance));
// A private inner class that wraps up a MCMaxEnt classifier and its training data.
// The result is a maximize.Maximizable function.
private class MaximizableTrainer implements Optimizable.ByGradientValue
// Flat arrays addressed as label*numFeatures + feature:
//   parameters     - the weights being optimized
//   constraints    - sums accumulated from the training data by the two-argument constructor
//   cachedGradient - scratch space for expectations, then the gradient itself
double[] parameters, constraints, cachedGradient;
MCMaxEnt theClassifier;
InstanceList trainingList;
// The expectations are (temporarily) stored in the cachedGradient
double cachedValue;
// Staleness flags: set whenever parameters change, cleared once the cache is recomputed.
boolean cachedValueStale;
boolean cachedGradientStale;
int numLabels;
// Size of the data alphabet plus one slot for the default feature.
int numFeatures;
int defaultFeatureIndex; // just for clarity
FeatureSelection featureSelection;
FeatureSelection[] perLabelFeatureSelection;
// No-argument constructor: leaves every field at its Java default (arrays and classifier are null).
public MaximizableTrainer (){}
// Builds the optimizable objective over ilist: sizes the parameter, constraint and gradient
// arrays from the list's alphabets, adopts initialClassifier's state when one is supplied
// (otherwise creates a new MCMaxEnt), and accumulates the constraints from the training
// instances.
// NOTE(review): closing braces and at least one guard appear to be missing in this copy;
// comments describe only what is visible.
public MaximizableTrainer (InstanceList ilist, MCMaxEnt initialClassifier)
this.trainingList = ilist;
Alphabet fd = ilist.getDataAlphabet();
LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet();
// Don't fd.stopGrowth, because someone might want to do feature induction
// Add one feature for the "default feature".
this.numLabels = ld.size();
this.numFeatures = fd.size() + 1;
this.defaultFeatureIndex = numFeatures-1;
this.parameters = new double [numLabels * numFeatures];
this.constraints = new double [numLabels * numFeatures];
this.cachedGradient = new double [numLabels * numFeatures];
Arrays.fill (parameters, 0.0);
Arrays.fill (constraints, 0.0);
Arrays.fill (cachedGradient, 0.0);
this.featureSelection = ilist.getFeatureSelection();
this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection();
// Add the default feature index to the selection
if (featureSelection != null)
featureSelection.add (defaultFeatureIndex);
if (perLabelFeatureSelection != null)
for (int i = 0; i < perLabelFeatureSelection.length; i++)
perLabelFeatureSelection[i].add (defaultFeatureIndex);
// xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
assert (featureSelection == null || perLabelFeatureSelection == null);
// A supplied classifier replaces the parameter array, both feature selections and the default
// feature index set above. constraints and cachedGradient keep the sizes computed from ilist.
if (initialClassifier != null) {
this.theClassifier = initialClassifier;
this.parameters = theClassifier.parameters;
this.featureSelection = theClassifier.featureSelection;
this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
assert (initialClassifier.getInstancePipe() == ilist.getPipe());
// Otherwise create a classifier that shares this object's parameter array.
else if (this.theClassifier == null) {
this.theClassifier = new MCMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
cachedValueStale = true;
cachedGradientStale = true;
// Initialize the constraints
logger.fine("Number of instances in training list = " + trainingList.size());
for (Instance inst : trainingList) {
double instanceWeight = trainingList.getInstanceWeight(inst);
Labeling labeling = inst.getLabeling ();
//logger.fine ("Instance "+ii+" labeling="+labeling);
FeatureVector fv = (FeatureVector) inst.getData ();
Alphabet fdict = fv.getAlphabet();
assert (fv.getAlphabet() == fd);
// Only the single best label is used (labelings that are distributions are not handled).
int li = labeling.getBestIndex();
// The "2*" below is because there is one copy for the p(y|x)and another for the p(x|y).
MatrixOps.rowPlusEquals (constraints, numFeatures, li, fv, 2*instanceWeight);
// For the default feature, whose weight is 1.0
assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
// NOTE(review): li is an int, so this NaN check can never fail.
assert(!Double.isNaN(li)) : "bestIndex is NaN";
// Diagnostic scan: log every feature of this instance whose value is NaN.
boolean hasNaN = false;
for(int i = 0; i < fv.numLocations(); i++) {
if(Double.isNaN(fv.valueAtLocation(i))) {
logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
hasNaN = true;
// NOTE(review): hasNaN is never read in the visible code; presumably an "if (hasNaN)" guard
// preceded this log call -- confirm.
logger.info("NaN in instance: " + inst.getName());
// Only p(y|x) uses the default feature; p(x|y) doesn't use it. The default feature value is 1.0.
constraints[li*numFeatures + defaultFeatureIndex] += instanceWeight;
//TestMaximizable.testValueAndGradientCurrentParameters (this);
/** Returns the classifier wrapped by this optimizable. */
public MCMaxEnt getClassifier ()
{
  return theClassifier;
}
public double getParameter (int index) {
return parameters[index];
public void setParameter (int index, double v) {
cachedValueStale = true;
cachedGradientStale = true;
parameters[index] = v;
public int getNumParameters() {
return parameters.length;
public void getParameters (double[] buff) {
if (buff == null || buff.length != parameters.length)
buff = new double [parameters.length];
System.arraycopy (parameters, 0, buff, 0, parameters.length);
public void setParameters (double [] buff) {
assert (buff != null);
cachedValueStale = true;
cachedGradientStale = true;
if (buff.length != parameters.length)
parameters = new double[buff.length];
System.arraycopy (buff, 0, parameters, 0, buff.length);
// log probability of the training labels
// Returns the cached objective, recomputing it when stale. The recomputation also leaves the
// negated model expectations in cachedGradient for getValueGradient() to finish.
// NOTE(review): closing braces are missing in this copy; comments describe only what is visible.
public double getValue ()
if (cachedValueStale) {
cachedValue = 0;
// We'll store the expectation values in "cachedGradient" for now
cachedGradientStale = true;
java.util.Arrays.fill (cachedGradient, 0.0);
// Incorporate likelihood of data
double[] scores = new double[trainingList.getTargetAlphabet().size()];
double value = 0.0;
//System.out.println("I Now "+inputAlphabet.size()+" regular features.");
Iterator<Instance> iter = trainingList.iterator();
//int ii = 0;
// Normalize the parameters to be per-class multinomials
// probs[label][feature] is a softmax over that label's parameters; lprobs holds its log.
double probs[][] = new double[scores.length][numFeatures];
double lprobs[][] = new double[scores.length][numFeatures];
for (int si = 0; si < scores.length; si++) {
// max is taken over the whole parameter array (not just this label's row) and is subtracted
// before exponentiating.
double sum = 0, max = MatrixOps.max (parameters);
for (int fi = 0; fi < numFeatures; fi++) {
// TODO Strongly consider some smoothing here. What happens when all parameters are zero?
// Oh, this should be no problem, because exp(0) == 1.
probs[si][fi] = Math.exp(parameters[si*numFeatures+fi] - max);
sum += probs[si][fi];
assert (sum > 0);
for (int fi = 0; fi < numFeatures; fi++) {
probs[si][fi] /= sum;
lprobs[si][fi] = Math.log(probs[si][fi]);
while (iter.hasNext()) {
Instance instance = iter.next();
double instanceWeight = trainingList.getInstanceWeight(instance);
Labeling labeling = instance.getLabeling ();
//System.out.println("L Now "+inputAlphabet.size()+" regular features.");
// Fill scores with the classifier's per-label scores for this instance.
this.theClassifier.getClassificationScores (instance, scores);
FeatureVector fv = (FeatureVector) instance.getData ();
int li = labeling.getBestIndex();
// Weighted negative log of the score given to the instance's best label.
value = - (instanceWeight * Math.log (scores[li]));
if(Double.isNaN(value)) {
logger.fine ("MCMaxEntTrainer: Instance " + instance.getName() +
"has NaN value. log(scores)= " + Math.log(scores[li]) +
" scores = " + scores[li] +
" has instance weight = " + instanceWeight);
// An infinite term ends the computation early: the cache is marked fresh and -value is returned.
if (Double.isInfinite(value)) {
logger.warning ("Instance "+instance.getSource() + " has infinite value; skipping value and gradient");
cachedValue -= value;
cachedValueStale = false;
return -value;
// continue;
cachedValue += value;
// CPAL - this is a loop over classes and their scores
// - we compute the gradient by taking the dot product of the feature value
// and the probability of the class
for (int si = 0; si < scores.length; si++) {
if (scores[si] == 0) continue;
assert (!Double.isInfinite(scores[si]));
// CPAL - accumulating the current classifiers expectation of the feature
// vector counts for this class label
// Current classifier has expectation over class label, not over feature vector
MatrixOps.rowPlusEquals (cachedGradient, numFeatures,
si, fv, -instanceWeight * scores[si]);
// The default feature (value 1.0) gets the same negated, score-weighted contribution.
cachedGradient[numFeatures*si + defaultFeatureIndex] += (-instanceWeight * scores[si]);
// CPAL - if we wish to do multiconditional training we need another term for this accumulated
// expectation
if (usingMultiConditionalTraining) {
// need something analogous to this
// this.theClassifier.getClassificationScores (instance, scores);
// this.theClassifier.getFeatureDistributions (instance,
// Note: li is the "label" for this instance
// Get the sum of the feature vector
// which is the number of counts for the document if we use that as input
double Ncounts = MatrixOps.sum(fv);
// CPAL - get the additional term for the value of our - log probability
// - this computation amounts to the dot product of the feature vector and the probability vector
cachedValue -= (instanceWeight * fv.dotProduct(lprobs[li]));
// CPAL - get the model expectation over features for the given class
for (int fi = 0; fi < numFeatures; fi++) {
//if(parameters[numFeatures*li + fi] != 0) {
// MatrixOps.rowPlusEquals(cachedGradient, numFeatures,li,fv,))
cachedGradient[numFeatures*li + fi] += (-instanceWeight * Ncounts * probs[li][fi]);
// }
//logger.info ("-Expectations:"); cachedGradient.print();
// Incorporate prior on parameters
// Hyperbolic prior: adds slope/sharpness * log(cosh(sharpness * parameter)) per parameter.
if (usingHyperbolicPrior) {
for (int li = 0; li < numLabels; li++)
for (int fi = 0; fi < numFeatures; fi++)
cachedValue += (hyperbolicPriorSlope / hyperbolicPriorSharpness
* Math.log (Maths.cosh (hyperbolicPriorSharpness * parameters[li *numFeatures + fi])));
} else {
// Gaussian prior: adds parameter^2 / (2 * variance) per parameter.
for (int li = 0; li < numLabels; li++)
for (int fi = 0; fi < numFeatures; fi++) {
double param = parameters[li*numFeatures + fi];
cachedValue += param * param / (2 * gaussianPriorVariance);
// Everything above accumulated a cost; flip the sign so the optimizer maximizes.
cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
cachedValueStale = false;
progressLogger.info ("Value (loglikelihood) = "+cachedValue);
return cachedValue;
// CPAL first get value, then gradient
// Copies the gradient of the objective into buffer, recomputing it when stale.
// NOTE(review): closing braces are missing in this copy; comments describe only what is visible.
public void getValueGradient (double [] buffer)
// Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
if (cachedGradientStale) {
if (cachedValueStale)
// This will fill in the cachedGradient with the "-expectation"
getValue ();
// cachedGradient contains the negative expectations
// expectations are model expectations and constraints are
// empirical expectations
MatrixOps.plusEquals (cachedGradient, constraints);
// CPAL - we need a second copy of the constraints
// - actually, we only want this for the feature values
// - I've moved this up into getValue
//if (usingMultiConditionalTraining){
// MatrixOps.plusEquals(cachedGradient, constraints);
// Incorporate prior on parameters
// The hyperbolic prior has a value term in getValue() but no gradient here, so requesting
// the gradient with that prior enabled throws.
if (usingHyperbolicPrior) {
throw new UnsupportedOperationException ("Hyperbolic prior not yet implemented.");
else {
// Gaussian prior contribution: -parameter / variance.
MatrixOps.plusEquals (cachedGradient, parameters,
-1.0 / gaussianPriorVariance);
// A parameter may be set to -infinity by an external user.
// We set gradient to 0 because the parameter's value can
// never change anyway and it will mess up future calculations
// on the matrix, such as norm().
MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
// Set to zero all the gradient dimensions that are not among the selected features
// NOTE(review): featureSelection itself may be null here; presumably rowSetAll treats a null
// selection as "nothing to mask" -- confirm.
if (perLabelFeatureSelection == null) {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0, featureSelection, false);
} else {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0,
perLabelFeatureSelection[labelIndex], false);
cachedGradientStale = false;
// The caller's buffer must already have exactly one slot per parameter.
assert (buffer != null && buffer.length == parameters.length);
System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
public double sumNegLogProb (double a, double b)
else if (a > b)
return b - Math.log (1 + Math.exp(b-a));
return a - Math.log (1 + Math.exp(a-b));