package cc.mallet.fst;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Random;
import java.util.logging.Logger;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizer;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.LabelVector;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Sequence;
import cc.mallet.util.MalletLogger;
/**
 * Unlike ClassifierTrainer, TransducerTrainer is not "stateless" between calls
 * to train. A TransducerTrainer is constructed paired with a specific
 * Transducer, and can only train that Transducer. CRF stores and has methods
 * for FeatureSelection and weight freezing. CRFTrainer stores and has methods
 * for determining the contents/dimensions/sparsity/FeatureInduction of the
 * CRF's weights as determined by training data.
 * <p>
 * <b>Note:</b> In the future this class may go away in favor of some default
 * version of CRFTrainerByValueGradients.
 */
public class CRFTrainerByLabelLikelihood extends TransducerTrainer implements TransducerTrainer.ByOptimization {
// Logger for this trainer; obtained once from MalletLogger and never reassigned.
private static final Logger logger = MalletLogger.getLogger(CRFTrainerByLabelLikelihood.class.getName());

// Defaults used to initialize the regularization settings below.
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1.0;
static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;

// The CRF being trained; assigned by the constructor.
CRF crf;
// Objective created lazily by getOptimizableCRF(); null until then and whenever it must be rebuilt.
CRFOptimizableByLabelLikelihood ocrf;
// Optimizer created lazily by getOptimizer(InstanceList); cleared whenever ocrf is rebuilt.
Optimizer opt;
// Number of optimizer steps completed by train().
int iterationCount = 0;
// Value reported by isConverged() and isFinishedTraining().
boolean converged;

// Regularization settings, exposed through the setters/getters further down.
boolean usingHyperbolicPrior = false;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;

// When true, getOptimizableCRF() sizes the weights from the training set (sparse);
// when false it asks the CRF for dense weights.
boolean useSparseWeights = true;
// When true, getOptimizableCRF() does not size the CRF's weights at all (see setAddNoFactors).
boolean useNoWeights = false; // TODO remove this; it is just for debugging
// Passed through to crf.setWeightsDimensionAsIn() when sparse weights are in use.
private transient boolean useSomeUnsupportedTrick = true;

// Various values from CRF acting as indicators of when we need to ...
// NOTE(review): the first two stamps are not read by any code visible in this file — confirm they are still needed.
private int cachedValueWeightsStamp = -1; // ... re-calculate expectations and values to getValue() because weights' values changed
private int cachedGradientWeightsStamp = -1; // ... re-calculate to getValueGradient() because weights' values changed
private int cachedWeightsStructureStamp = -1; // ... re-allocate crf.weights, expectations & constraints because new states, transitions
// ocrf.trainingSet is compared against the requested training set to detect when the objective
// must be rebuilt because a different InstanceList is being used than last time.

// xxx temporary hack. This is quite useful to have, though!! -cas
public boolean printGradient = false;
/**
 * Creates a trainer bound to the given CRF.
 *
 * @param crf the CRF this trainer will train; the reference is stored, not copied
 */
public CRFTrainerByLabelLikelihood (CRF crf) {
    this.crf = crf;
}
/** Returns the CRF held by this trainer, typed as a Transducer. */
public Transducer getTransducer() {
    return this.crf;
}

/** Returns the CRF held by this trainer. */
public CRF getCRF () {
    return this.crf;
}

/** Returns the current optimizer field; it is null until an optimizer has been created. */
public Optimizer getOptimizer() {
    return this.opt;
}

/** Returns the trainer's convergence flag. */
public boolean isConverged() {
    return this.converged;
}

/** Returns the same convergence flag as {@link #isConverged()}. */
public boolean isFinishedTraining() {
    return this.converged;
}

/** Returns the number of optimizer steps counted so far by this trainer. */
public int getIteration () {
    return this.iterationCount;
}
/**
 * Use this method to specify whether or not factors
 * are added to the CRF by this trainer. If you have
 * already set up the factors in your CRF, you may
 * not want the trainer to add additional factors.
 *
 * @param flag If true, this trainer adds no factors to the CRF.
 */
public void setAddNoFactors(boolean flag) {
this.useNoWeights = flag;
public CRFOptimizableByLabelLikelihood getOptimizableCRF (InstanceList trainingSet) {
if (cachedWeightsStructureStamp != crf.weightsStructureChangeStamp) {
if (!useNoWeights) {
if (useSparseWeights)
crf.setWeightsDimensionAsIn (trainingSet, useSomeUnsupportedTrick);
crf.setWeightsDimensionDensely ();
//reallocateSufficientStatistics(); // Not necessary here because it is done in the constructor for OptimizableCRF
ocrf = null;
cachedWeightsStructureStamp = crf.weightsStructureChangeStamp;
if (ocrf == null || ocrf.trainingSet != trainingSet) {
//ocrf = new OptimizableCRF (crf, trainingSet);
ocrf = new CRFOptimizableByLabelLikelihood(crf, trainingSet);
opt = null;
return ocrf;
public Optimizer getOptimizer (InstanceList trainingSet) {
getOptimizableCRF(trainingSet); // this will set this.mcrf if necessary
if (opt == null || ocrf != opt.getOptimizable())
opt = new LimitedMemoryBFGS(ocrf); // Alternative: opt = new ConjugateGradient (0.001);
return opt;
// Java question:
// If I make a non-static inner class CRF.Trainer,
// can that class be subclassed in another .java file,
// and can that subclass still have access to all the CRF's
// instance variables?
// ANSWER: Yes and yes, but you have to use special syntax in the subclass ctor (see mallet-dev archive) -cas
public boolean trainIncremental (InstanceList training)
return train (training, Integer.MAX_VALUE);
public boolean train (InstanceList trainingSet, int numIterations) {
if (numIterations <= 0)
return false;
assert (trainingSet.size() > 0);
getOptimizableCRF(trainingSet); // This will set this.mcrf if necessary
getOptimizer(trainingSet); // This will set this.opt if necessary
boolean converged = false; ("CRF about to train with "+numIterations+" iterations");
for (int i = 0; i < numIterations; i++) {
try {
converged = opt.optimize (1);
iterationCount++; ("CRF finished one iteration of maximizer, i="+i);
} catch (IllegalArgumentException e) {
e.printStackTrace(); ("Catching exception; saying converged.");
converged = true;
} catch (Exception e) {
e.printStackTrace();"Catching exception; saying converged.");
converged = true;
if (converged) { ("CRF training has converged, i="+i);
return converged;
/**
 * Train a CRF on various-sized subsets of the data. This method is typically used to accelerate training by
 * quickly getting to reasonable parameters on only a subset of the data first, then on progressively more data.
 *
 * @param training The training Instances.
 * @param numIterationsPerProportion Maximum number of Maximizer iterations per training proportion.
 * @param trainingProportions If non-null, train on increasingly
 * larger portions of the data, e.g. new double[] {0.2, 0.5, 1.0}. This can sometimes speed up convergence.
 * Be sure to end in 1.0 if you want to train on all the data in the end.
 * @return True if training has converged.
 */
public boolean train (InstanceList training, int numIterationsPerProportion, double[] trainingProportions)
int trainingIteration = 0;
assert (trainingProportions.length > 0);
boolean converged = false;
for (int i = 0; i < trainingProportions.length; i++) {
assert (trainingProportions[i] <= 1.0); ("Training on "+trainingProportions[i]+"% of the data this round.");
if (trainingProportions[i] == 1.0)
converged = this.train (training, numIterationsPerProportion);
converged = this.train (training.split (new Random(1),
new double[] {trainingProportions[i], 1-trainingProportions[i]})[0], numIterationsPerProportion);
trainingIteration += numIterationsPerProportion;
return converged;
public boolean trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData, InstanceList testingData,
TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold,
boolean clusteredFeatureInduction,
double[] trainingProportions)
return trainWithFeatureInduction (trainingData, validationData, testingData,
eval, numIterations, numIterationsBetweenFeatureInductions,
numFeatureInductions, numFeaturesPerFeatureInduction,
trueLabelProbThreshold, clusteredFeatureInduction,
trainingProportions, "exp");
/**
 * Train a CRF using feature induction to generate conjunctions of
 * features. Feature induction is run periodically during
 * training. The features are added to improve performance on the
 * mislabeled instances, with the specific scoring criterion given
 * by the {@link FeatureInducer} specified by <code>gainName</code>.
 *
 * @param trainingData The training Instances.
 * @param validationData The validation Instances.
 * @param testingData The testing instances.
 * @param eval For evaluation during training.
 * @param numIterations Maximum number of Maximizer iterations.
 * @param numIterationsBetweenFeatureInductions Number of maximizer
 * iterations between each call to the Feature Inducer.
 * @param numFeatureInductions Maximum number of rounds of feature
 * induction.
 * @param numFeaturesPerFeatureInduction Maximum number of features
 * to induce at each round of induction.
 * @param trueLabelProbThreshold If the model's probability of the
 * true Label of an Instance is less than this value, it is added as
 * an error instance to the {@link FeatureInducer}.
 * @param clusteredFeatureInduction If true, a separate {@link
 * FeatureInducer} is constructed for each label pair. This can
 * avoid inducing a disproportionate number of features for a single
 * label.
 * @param trainingProportions If non-null, train on increasingly
 * larger portions of the data (e.g. [0.2, 0.5, 1.0]). This can
 * sometimes speed up convergence.
 * @param gainName The type of {@link FeatureInducer} to use. One of
 * "exp", "grad", or "info" for {@link ExpGain}, {@link
 * GradientGain}, or {@link InfoGain}.
 * @return True if training has converged.
 */
public boolean trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData, InstanceList testingData,
TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold,
boolean clusteredFeatureInduction,
double[] trainingProportions,
String gainName)
// Overview of what the remaining lines show: for each feature-induction round the method
// (1) picks the data for the round, (2) trains on it except in round 0, (3) collects the
// positions whose true label gets probability below trueLabelProbThreshold, and (4) builds
// FeatureInducer(s) from those errors and applies them to the data sets. After the rounds it
// returns train(trainingData, numIterations - trainingIteration).
// NOTE(review): this method's text is incomplete in this copy of the file. Several statements
// are cut off mid-expression, closing braces and what were presumably `else` / `continue` lines
// are missing, and the bare parenthesised strings look like log calls that lost their receiver.
// Restore the missing text before compiling; the comments below describe only what is visible.
// NOTE(review): the 'eval' parameter is not used by any line visible here.
// trainingIteration accumulates the iterations budgeted between induction rounds; it is
// subtracted from numIterations for the final train() call.
int trainingIteration = 0;
int numLabels = crf.outputAlphabet.size();
// Reuse the training data's FeatureSelection if it has one; otherwise create one over the data
// alphabet and install it on the training data.
crf.globalFeatureSelection = trainingData.getFeatureSelection();
if (crf.globalFeatureSelection == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
crf.globalFeatureSelection = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (crf.globalFeatureSelection);
// TODO Careful! If validationData and testingData get removed as arguments to this method
// then the next two lines of work will have to be done somewhere.
if (validationData != null) validationData.setFeatureSelection (crf.globalFeatureSelection);
if (testingData != null) testingData.setFeatureSelection (crf.globalFeatureSelection);
// One pass per induction round.
// NOTE(review): the loop header below stops after its condition; the update clause and the
// opening brace are not visible.
for (int featureInductionIteration = 0;
featureInductionIteration < numFeatureInductions;
// Print out some feature information ("Feature induction iteration "+featureInductionIteration);
// Train the CRF
InstanceList theTrainingData = trainingData;
// While there are still entries in trainingProportions, the round works on a random split of
// the training data (fixed seed 1) instead of the full set.
if (trainingProportions != null && featureInductionIteration < trainingProportions.length) { ("Training on "+trainingProportions[featureInductionIteration]+"% of the data this round.");
// NOTE(review): the split() call below is cut off inside its proportions array.
InstanceList[] sampledTrainingData = trainingData.split (new Random(1),
new double[] {trainingProportions[featureInductionIteration],
theTrainingData = sampledTrainingData[0];
theTrainingData.setFeatureSelection (crf.globalFeatureSelection); // xxx necessary? (" which is "+theTrainingData.size()+" instances");
boolean converged = false;
if (featureInductionIteration != 0)
// Don't train until we have added some features
converged = this.train (theTrainingData, numIterationsBetweenFeatureInductions);
trainingIteration += numIterationsBetweenFeatureInductions; ("Starting feature induction with "+crf.inputAlphabet.size()+" features.");
// Create the list of error tokens, for both unclustered and clustered feature induction
// NOTE(review): the InstanceList constructor call below is cut off after its first argument.
InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (crf.globalFeatureSelection);
ArrayList errorLabelVectors = new ArrayList();
// Per-cluster error collections, indexed [prevLabelIndex][labelIndex] as computed in the
// per-position loop further down.
InstanceList clusteredErrorInstances[][] = new InstanceList[numLabels][numLabels];
ArrayList clusteredErrorLabelVectors[][] = new ArrayList[numLabels][numLabels];
for (int i = 0; i < numLabels; i++)
for (int j = 0; j < numLabels; j++) {
// NOTE(review): this constructor call is cut off after its first argument as well.
clusteredErrorInstances[i][j] = new InstanceList (trainingData.getDataAlphabet(),
clusteredErrorInstances[i][j].setFeatureSelection (crf.globalFeatureSelection);
clusteredErrorLabelVectors[i][j] = new ArrayList();
// Scan every position of every instance in this round's data and record the error positions.
for (int i = 0; i < theTrainingData.size(); i++) { ("instance="+i);
Instance instance = theTrainingData.get(i);
Sequence input = (Sequence) instance.getData();
Sequence trueOutput = (Sequence) instance.getTarget();
assert (input.size() == trueOutput.size());
// NOTE(review): the newSumLattice call below is cut off after its fourth argument.
SumLattice lattice =
crf.sumLatticeFactory.newSumLattice (crf, input, (Sequence)null, (Transducer.Incrementor)null,
int prevLabelIndex = 0; // This will put extra error instances in this cluster
for (int j = 0; j < trueOutput.size(); j++) {
Label label = (Label) ((LabelSequence)trueOutput).getLabelAtPosition(j);
assert (label != null);
//System.out.println ("Instance="+i+" position="+j+" fv="+lattice.getLabelingAtPosition(j).toString(true));
LabelVector latticeLabeling = lattice.getLabelingAtPosition(j);
// Value the lattice labeling assigns to the true label at this position.
double trueLabelProb = latticeLabeling.value(label.getIndex());
// Index of the best-scoring label at this position; used as the cluster's second index.
int labelIndex = latticeLabeling.getBestIndex();
//System.out.println ("position="+j+" trueLabelProb="+trueLabelProb);
// Below the threshold the position is recorded both in the global error list and in the
// cluster for (prevLabelIndex, labelIndex).
if (trueLabelProb < trueLabelProbThreshold) { ("Adding error: instance="+i+" position="+j+" prtrue="+trueLabelProb+
(label == latticeLabeling.getBestLabel() ? " " : " *")+
" truelabel="+label+
" predlabel="+latticeLabeling.getBestLabel()+
" fv="+((FeatureVector)input.get(j)).toString(true));
errorInstances.add (input.get(j), label, null, null);
errorLabelVectors.add (latticeLabeling);
clusteredErrorInstances[prevLabelIndex][labelIndex].add (input.get(j), label, null, null);
clusteredErrorLabelVectors[prevLabelIndex][labelIndex].add (latticeLabeling);
prevLabelIndex = labelIndex;
} ("Error instance list size = "+errorInstances.size());
// Clustered mode: one FeatureInducer per (i, j) cluster.
if (clusteredFeatureInduction) {
FeatureInducer[][] klfi = new FeatureInducer[numLabels][numLabels];
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
// Note that we may see some "impossible" transitions here (like O->I in a OIB model)
// because we are using lattice gammas to get the predicted label, not Viterbi.
// I don't believe this does any harm, and may do some good. ("Doing feature induction for "+
crf.outputAlphabet.lookupObject(i)+" -> "+crf.outputAlphabet.lookupObject(j)+
" with "+clusteredErrorInstances[i][j].size()+" instances");
// Clusters with fewer than 20 error instances are reported as skipped.
// NOTE(review): presumably a `continue;` followed here — it is not visible.
if (clusteredErrorInstances[i][j].size() < 20) { ("..skipping because only "+clusteredErrorInstances[i][j].size()+" instances.");
// Copy the raw ArrayList of label vectors into a typed array for the gain factories.
int s = clusteredErrorLabelVectors[i][j].size();
LabelVector[] lvs = new LabelVector[s];
for (int k = 0; k < s; k++)
lvs[k] = (LabelVector) clusteredErrorLabelVectors[i][j].get(k);
// Select the gain criterion by name; no branch handles any other name, so gainFactory
// stays null in that case.
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals ("exp"))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals("grad"))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals("info"))
gainFactory = new InfoGain.Factory ();
// NOTE(review): the FeatureInducer constructor call below is cut off after its first argument.
klfi[i][j] = new FeatureInducer (gainFactory,
// Second pass: apply each cluster's inducer to the full training data and, if present, the
// testing data.
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) { ("Adding new induced features for "+
crf.outputAlphabet.lookupObject(i)+" -> "+crf.outputAlphabet.lookupObject(j));
// NOTE(review): presumably a `continue;` followed here — it is not visible.
if (klfi[i][j] == null) { ("...skipping because no features induced.");
// Note that this adds features globally, but not on a per-transition basis
klfi[i][j].induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi[i][j].induceFeaturesFor (testingData, false, false);
klfi = null;
} else {
// Unclustered mode: a single FeatureInducer built from all recorded errors.
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++)
lvs[i] = (LabelVector) errorLabelVectors.get(i);
// Same name-based selection as in the clustered branch; unknown names leave gainFactory null.
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals ("exp"))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals("grad"))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals("info"))
gainFactory = new InfoGain.Factory ();
// NOTE(review): the FeatureInducer constructor call below is cut off after its first argument.
FeatureInducer klfi =
new FeatureInducer (gainFactory,
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false); ("CRF4 FeatureSelection now includes "+crf.globalFeatureSelection.cardinality()+" features");
klfi = null;
// This is done in CRF4.train() anyway
//this.setWeightsDimensionAsIn (trainingData);
////this.growWeightsDimensionToInputAlphabet ();
// Final training pass on the full training data with whatever iteration budget is left.
return this.train (trainingData, numIterations - trainingIteration);
/** Stores the flag that selects the hyperbolic prior. */
public void setUseHyperbolicPrior (boolean f) {
    this.usingHyperbolicPrior = f;
}

/** Stores the slope of the hyperbolic prior. */
public void setHyperbolicPriorSlope (double p) {
    this.hyperbolicPriorSlope = p;
}

/** Stores the sharpness of the hyperbolic prior. */
public void setHyperbolicPriorSharpness (double p) {
    this.hyperbolicPriorSharpness = p;
}

/** Returns the stored hyperbolic prior slope (despite the "Use" in the method name). */
public double getUseHyperbolicPriorSlope () {
    return this.hyperbolicPriorSlope;
}

/** Returns the stored hyperbolic prior sharpness (despite the "Use" in the method name). */
public double getUseHyperbolicPriorSharpness () {
    return this.hyperbolicPriorSharpness;
}

/** Stores the Gaussian prior variance. */
public void setGaussianPriorVariance (double p) {
    this.gaussianPriorVariance = p;
}

/** Returns the stored Gaussian prior variance. */
public double getGaussianPriorVariance () {
    return this.gaussianPriorVariance;
}

//public int getDefaultFeatureIndex () { return defaultFeatureIndex;}

/** Stores whether weights are sized sparsely from the training set (true) or densely (false). */
public void setUseSparseWeights (boolean b) {
    this.useSparseWeights = b;
}

/** Returns the stored sparse-weights flag. */
public boolean getUseSparseWeights () {
    return this.useSparseWeights;
}
/** Sets whether to use the 'some unsupported trick.' This trick is, if training a CRF
 * where some training has been done and sparse weights are used, to add a few weights
 * for features that do not occur in the training data.
 * <p>
 * This generally leads to better accuracy at only a small memory cost.
 *
 * @param b Whether to use the trick
 */
public void setUseSomeUnsupportedTrick (boolean b) {
    // Kept in a field; handed to crf.setWeightsDimensionAsIn() when sparse weights are sized.
    this.useSomeUnsupportedTrick = b;
}
// Serialization for CRFTrainerByLikelihood
// NOTE(review): the class declaration visible in this file does not itself name Serializable;
// presumably it is inherited — confirm.
// Standard Java serialization class-version identifier.
private static final long serialVersionUID = 1;
// Version tag for this class's own stream layout; readObject() starts by reading an int version.
private static final int CURRENT_SERIAL_VERSION = 1;
// NOTE(review): not referenced by any code visible in this file — looks like a sentinel for
// "no value"; confirm before removing.
static final int NULL_INTEGER = -1;
/* Need to check for null pointers. */
private void writeObject (ObjectOutputStream out) throws IOException {
int i, size;
out.writeBoolean (useSparseWeights);
throw new IllegalStateException("Implementation not yet complete.");
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int size, i;
int version = in.readInt ();
//defaultFeatureIndex = in.readInt();
usingHyperbolicPrior = in.readBoolean();
gaussianPriorVariance = in.readDouble();
hyperbolicPriorSlope = in.readDouble();
hyperbolicPriorSharpness = in.readDouble();
printGradient = in.readBoolean();
useSparseWeights = in.readBoolean();
throw new IllegalStateException("Implementation not yet complete.");