/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify;
import java.util.logging.*;
import java.util.*;
import java.io.*;
import cc.mallet.classify.Classifier;
import cc.mallet.optimize.ConjugateGradient;
import cc.mallet.optimize.InvalidOptimizableException;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.OptimizationException;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.OrthantWiseLimitedMemoryBFGS;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;
//Does not currently handle instances that are labeled with distributions
//instead of a single label.
/**
* The trainer for a Maximum Entropy classifier.
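*
* <p>A minimal usage sketch; {@code trainingInstances} and {@code testInstance}
* are illustrative names for data assumed to have been built elsewhere (for
* example through a {@link Pipe}):</p>
* <pre>{@code
* MaxEntTrainer trainer = new MaxEntTrainer();
* MaxEnt classifier = trainer.train(trainingInstances);
* Classification result = classifier.classify(testInstance);
* Label predicted = result.getLabeling().getBestLabel();
* }</pre>
*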
* @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
public class MaxEntTrainer extends ClassifierTrainer<MaxEnt>
implements ClassifierTrainer.ByOptimization<MaxEnt>, Boostable, Serializable {
private static Logger logger = MalletLogger.getLogger(MaxEntTrainer.class.getName());
private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MaxEntTrainer.class.getName()+"-pl");
int numIterations = Integer.MAX_VALUE;
//public static final String EXP_GAIN = "exp";
//public static final String GRADIENT_GAIN = "grad";
//public static final String INFORMATION_GAIN = "info";
// xxx Why does TestMaximizable fail when this variance is very small?
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1;
static final double DEFAULT_L1_WEIGHT = 0.0;
static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double l1Weight = DEFAULT_L1_WEIGHT;
Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;
InstanceList trainingSet = null;
MaxEnt initialClassifier;
MaxEntOptimizableByLabelLikelihood optimizable = null;
Optimizer optimizer = null;
//
// CONSTRUCTORS
//
public MaxEntTrainer () {}
/** Construct a MaxEnt trainer using a trained classifier as
* initial values.
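*
* <p>A sketch of warm-starting training, where {@code previouslyTrained} is an
* illustrative name for a classifier produced by an earlier call to
* {@code train}:</p>
* <pre>{@code
* MaxEntTrainer trainer = new MaxEntTrainer(previouslyTrained);
* MaxEnt updated = trainer.train(trainingInstances);
* }</pre>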
*/
public MaxEntTrainer (MaxEnt theClassifierToTrain) {
this.initialClassifier = theClassifierToTrain;
}
/** Constructs a trainer with a Gaussian prior variance, used to limit
* overfitting. The default value is 1.0. */
public MaxEntTrainer (double gaussianPriorVariance) {
this.gaussianPriorVariance = gaussianPriorVariance;
}
//
// CLASSIFIER OBJECT: stores parameters
//
public MaxEnt getClassifier () {
if (optimizable != null)
return optimizable.getClassifier();
return initialClassifier;
}
/**
* Initialize parameters using the provided classifier.
*/
public void setClassifier (MaxEnt theClassifierToTrain) {
// Is this necessary? What if the caller is about to set the training set to something different? -akm
assert (trainingSet == null || Alphabet.alphabetsMatch(theClassifierToTrain, trainingSet));
if (this.initialClassifier != theClassifierToTrain) {
this.initialClassifier = theClassifierToTrain;
optimizable = null;
optimizer = null;
}
}
//
// OPTIMIZABLE OBJECT: implements value and gradient functions
//
public Optimizable getOptimizable () {
return optimizable;
}
public MaxEntOptimizableByLabelLikelihood getOptimizable (InstanceList trainingSet) {
return getOptimizable(trainingSet, getClassifier());
}
public MaxEntOptimizableByLabelLikelihood getOptimizable (InstanceList trainingSet, MaxEnt initialClassifier) {
if (trainingSet != this.trainingSet || this.initialClassifier != initialClassifier) {
this.trainingSet = trainingSet;
this.initialClassifier = initialClassifier;
if (optimizable == null || optimizable.trainingList != trainingSet) {
optimizable = new MaxEntOptimizableByLabelLikelihood (trainingSet, initialClassifier);
if (l1Weight == 0.0) {
optimizable.setGaussianPriorVariance(gaussianPriorVariance);
}
else {
// the prior term for L1-regularized classifiers
// is implemented as part of the optimizer,
// so don't include a prior calculation in the value and
// gradient functions.
optimizable.useNoPrior();
}
optimizer = null;
}
}
return optimizable;
}
//
// OPTIMIZER OBJECT: maximizes value function
//
public Optimizer getOptimizer () {
if (optimizer == null && optimizable != null) {
optimizer = new ConjugateGradient(optimizable);
}
return optimizer;
}
/** This method is called by the train method.
* This is the main entry point for the optimizable and optimizer
components.
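*
* <p>A sketch of driving the optimization by hand rather than calling
* {@code train}; {@code trainingInstances} is an illustrative name:</p>
* <pre>{@code
* MaxEntTrainer trainer = new MaxEntTrainer();
* Optimizer optimizer = trainer.getOptimizer(trainingInstances);
* boolean converged = optimizer.optimize(10); // run at most 10 iterations
* MaxEnt partiallyTrained = trainer.getClassifier();
* }</pre>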
*/
public Optimizer getOptimizer (InstanceList trainingSet) {
// If the data is not set, or has changed,
// initialize the optimizable object and
// replace the optimizer.
if (trainingSet != this.trainingSet ||
optimizable == null) {
getOptimizable(trainingSet);
optimizer = null;
}
// Build a new optimizer
if (optimizer == null) {
if (l1Weight == 0.0) {
optimizer = new LimitedMemoryBFGS(optimizable);
}
else {
// The L1 prior is applied inside the orthant-wise optimizer;
// with l1Weight == 0 it devolves to standard L-BFGS, but this
// implementation may be faster. The optimizable itself uses no
// prior in the L1 case (see getOptimizable above).
optimizer = new OrthantWiseLimitedMemoryBFGS(optimizable, l1Weight);
}
}
return optimizer;
}
/**
* Specifies the maximum number of iterations to run during a single call
* to <code>train</code> or <code>trainWithFeatureInduction</code>. Not
* currently functional.
* @return This trainer
*/
// XXX Since we maximize before using numIterations, this doesn't work.
// Is that a bug? If so, should the default numIterations be higher?
public MaxEntTrainer setNumIterations (int i) {
numIterations = i;
return this;
}
public int getIteration () {
if (optimizable == null)
return 0;
else
return Integer.MAX_VALUE;
// return optimizer.getIteration ();
}
/**
* Sets a parameter to prevent overtraining. A smaller variance for the prior
* means that feature weights are expected to hover closer to 0, so extra
* evidence is required to set a higher weight.
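* <p>For example, a sketch using a deliberately tight prior (the value 0.1 is
* only illustrative):</p>
* <pre>{@code
* MaxEnt classifier = new MaxEntTrainer()
*     .setGaussianPriorVariance(0.1)
*     .train(trainingInstances);
* }</pre>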
* @return This trainer
*/
public MaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance) {
this.gaussianPriorVariance = gaussianPriorVariance;
return this;
}
/**
* Use an L1 prior. Larger values mean parameters will be closer to 0.
* Note that this setting overrides any Gaussian prior.
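* <p>For example, a sketch of training a sparser model (the weight 10.0 is
* only illustrative):</p>
* <pre>{@code
* MaxEnt sparseClassifier = new MaxEntTrainer()
*     .setL1Weight(10.0)
*     .train(trainingInstances);
* }</pre>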
*/
public MaxEntTrainer setL1Weight(double l1Weight) {
this.l1Weight = l1Weight;
return this;
}
public MaxEnt train (InstanceList trainingSet) {
return train (trainingSet, numIterations);
}
public MaxEnt train (InstanceList trainingSet, int numIterations)
{
logger.fine ("trainingSet.size() = "+trainingSet.size());
getOptimizer (trainingSet); // This will set this.optimizer, this.optimizable
for (int i = 0; i < numIterations; i++) {
try {
finishedTraining = optimizer.optimize (1);
} catch (InvalidOptimizableException e) {
e.printStackTrace();
logger.warning("Catching InvalidOptimizatinException! saying converged.");
finishedTraining = true;
} catch (OptimizationException e) {
e.printStackTrace();
logger.info ("Catching OptimizationException; saying converged.");
finishedTraining = true;
}
if (finishedTraining)
break;
}
// Only if an unlimited number of iterations is allowed
if (numIterations == Integer.MAX_VALUE) {
// Run it again because in our and Sam Roweis' experience, BFGS can still
// eke out more likelihood after first convergence by re-running without
// being restricted by its gradient history.
optimizer = null;
getOptimizer(trainingSet);
try {
finishedTraining = optimizer.optimize ();
} catch (InvalidOptimizableException e) {
e.printStackTrace();
logger.warning("Catching InvalidOptimizatinException! saying converged.");
finishedTraining = true;
} catch (OptimizationException e) {
e.printStackTrace();
logger.info ("Catching OptimizationException; saying converged.");
finishedTraining = true;
}
}
//TestMaximizable.testValueAndGradientCurrentParameters (mt);
progressLogger.info("\n"); // progress messages are on one line; move on.
//logger.info("MaxEnt ngetValueCalls:"+getValueCalls()+"\nMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
return optimizable.getClassifier();
}
/**
* <p>Trains a maximum entropy model using feature selection and feature induction
* (adding conjunctions of features as new features).</p>
*
* @param trainingData A list of <code>Instance</code>s whose <code>data</code>
* fields are binary, augmentable <code>FeatureVector</code>s,
* and whose <code>target</code> fields are <code>Label</code>s.
* @param validationData [not currently used] As <code>trainingData</code>,
* or <code>null</code>.
* @param testingData As <code>trainingData</code>, or <code>null</code>.
* @param evaluator The evaluator to track training progress and decide whether
* to continue, or <code>null</code>.
* @param totalIterations The maximum total number of training iterations,
* including those taken during feature induction.
* @param numIterationsBetweenFeatureInductions How many iterations to train
* between one round of feature induction and the next; this should usually
* be fairly small, like 5 or 10, to avoid overfitting with current features.
* @param numFeatureInductions How many rounds of feature induction to run
* before beginning normal training.
* @param numFeaturesPerFeatureInduction The maximum number of features to
* choose during each round of feature induction.
*
* @return The trained <code>MaxEnt</code> classifier
*/
/*
// added - cjmaloof@linc.cis.upenn.edu
public Classifier trainWithFeatureInduction (InstanceList trainingData,
int totalIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction) {
return trainWithFeatureInduction (trainingData,
null,
totalIterations,
numIterationsBetweenFeatureInductions,
numFeatureInductions,
numFeaturesPerFeatureInduction,
EXP_GAIN);
}
*/
/**
* <p>Like the other version of <code>trainWithFeatureInduction</code>, but
* allows some default options to be changed.</p>
*
* @param maxent An initial partially-trained classifier (default <code>null</code>).
* This classifier may be modified during training.
* @param gainName The estimate of gain (log-likelihood increase) we want our chosen
* features to maximize.
* Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
* <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
* <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
*
* @return The trained <code>MaxEnt</code> classifier
*/
/* // Temporarily removed until I figure out how to handle induceFeaturesFor (testData)
public Classifier trainWithFeatureInduction (InstanceList trainingData,
int totalIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
String gainName) {
// XXX This ought to be a parameter, except that setting it to true can
// crash training ("Jump too small").
boolean saveParametersDuringFI = false;
Alphabet inputAlphabet = trainingData.getDataAlphabet();
Alphabet outputAlphabet = trainingData.getTargetAlphabet();
int trainingIteration = 0;
int numLabels = outputAlphabet.size();
MaxEnt maxent = getClassifier();
// Initialize feature selection
FeatureSelection globalFS = trainingData.getFeatureSelection();
if (globalFS == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
globalFS = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (globalFS);
}
//if (validationData != null) validationData.setFeatureSelection (globalFS);
//if (testingData != null) testingData.setFeatureSelection (globalFS);
getOptimizer(trainingData); // This will initialize this.me so getClassifier() below works
maxent.setFeatureSelection(globalFS);
// Run feature induction
for (int featureInductionIteration = 0; featureInductionIteration < numFeatureInductions; featureInductionIteration++) {
// Print out some feature information
logger.info ("Feature induction iteration "+featureInductionIteration);
// Train the model a little bit. We don't care whether it converges; we
// execute all feature induction iterations no matter what.
if (featureInductionIteration != 0) {
// Don't train until we have added some features
setNumIterations(numIterationsBetweenFeatureInductions);
train (trainingData);
}
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
" features over "+numLabels+" labels.");
// Create the list of error tokens
InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (globalFS);
List errorLabelVectors = new ArrayList(); // these are length-1 vectors
for (int i = 0; i < trainingData.size(); i++) {
Instance instance = trainingData.get(i);
FeatureVector inputVector = (FeatureVector) instance.getData();
Label trueLabel = (Label) instance.getTarget();
// Having trained using just the current features, see how we classify
// the training data now.
Classification classification = maxent.classify(instance);
if (!classification.bestLabelIsCorrect()) {
errorInstances.add(inputVector, trueLabel, null, null);
errorLabelVectors.add(classification.getLabelVector());
}
}
logger.info ("Error instance list size = "+errorInstances.size());
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++) {
lvs[i] = (LabelVector)errorLabelVectors.get(i);
}
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals (EXP_GAIN))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals(GRADIENT_GAIN))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals(INFORMATION_GAIN))
gainFactory = new InfoGain.Factory ();
else
throw new IllegalArgumentException("Unsupported gain name: "+gainName);
FeatureInducer klfi =
new FeatureInducer (gainFactory,
errorInstances,
numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction);
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
logger.info ("MaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
klfi = null;
double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];
// XXX (Executing this block often causes an error during training; I don't know why.)
if (saveParametersDuringFI) {
// Keep current parameter values
// XXX This relies on the implementation detail that the most recent features
// added to an Alphabet get the highest indices.
// Count parameters per output label
int oldParamCount = maxent.parameters.length / outputAlphabet.size();
int newParamCount = 1+inputAlphabet.size();
// Copy params into the proper locations
for (int i=0; i<outputAlphabet.size(); i++) {
System.arraycopy(maxent.parameters, i*oldParamCount,
newParameters, i*newParamCount,
oldParamCount);
}
for (int i=0; i<oldParamCount; i++)
if (maxent.parameters[i] != newParameters[i]) {
System.out.println(maxent.parameters[i]+" "+newParameters[i]);
System.exit(0);
}
}
maxent.parameters = newParameters;
maxent.defaultFeatureIndex = inputAlphabet.size();
}
// Finished feature induction
logger.info("Ended with "+globalFS.cardinality()+" features.");
setNumIterations(totalIterations - trainingIteration);
train (trainingData);
return maxent;
}
*/
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("MaxEntTrainer");
if (numIterations < Integer.MAX_VALUE) {
builder.append(",numIterations=" + numIterations);
}
if (l1Weight != 0.0) {
builder.append(",l1Weight=" + l1Weight);
}
else {
builder.append(",gaussianPriorVariance=" + gaussianPriorVariance);
}
return builder.toString();
}
}