/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.classify;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import cc.mallet.optimize.ConjugateGradient;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.Optimizer;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.FeatureVectorSequence;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labels;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;
/**
* The trainer for a {@link RankMaxEnt} classifier. Expects each Instance's
* data to be a FeatureVectorSequence, and its target to be a String
* representation of the index of the true best FeatureVector within that
* sequence. The target may instead be a Labels object holding several such
* indices, indicating a tie for the best position.
*
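* <p>A minimal usage sketch (the pipe below is hypothetical; it assumes the
* instances have already been piped so that each data is a
* FeatureVectorSequence and each target names the best index):</p>
* <pre>{@code
* InstanceList training = new InstanceList(myRankingPipe); // hypothetical pipe
* // ... add Instances whose data is a FeatureVectorSequence and whose
* // target is e.g. "0", the index of the true best FeatureVector ...
* RankMaxEntTrainer trainer = new RankMaxEntTrainer(1.0); // Gaussian prior variance
* MaxEnt classifier = trainer.train(training);
* }</pre>
*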
* @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a>
*/
public class RankMaxEntTrainer extends MaxEntTrainer
{
private static Logger logger = MalletLogger.getLogger(RankMaxEntTrainer.class.getName());
private static Logger progressLogger = MalletProgressMessageLogger.getLogger(RankMaxEntTrainer.class.getName()+"-pl");
public RankMaxEntTrainer () {
}
/** Constructs a trainer with a Gaussian prior variance, which regularizes
* the model to avoid overfitting. 1.0 is usually a reasonable default value. */
public RankMaxEntTrainer (double gaussianPriorVariance)
{
super (gaussianPriorVariance);
}
public Optimizable.ByGradientValue getMaximizableTrainer (InstanceList ilist)
{
if (ilist == null)
return new MaximizableTrainer ();
return new MaximizableTrainer (ilist, null);
}
public MaxEnt train (InstanceList trainingSet)
{
logger.fine ("trainingSet.size() = "+trainingSet.size());
RankMaxEntTrainer.MaximizableTrainer mt =
new RankMaxEntTrainer.MaximizableTrainer (trainingSet, (RankMaxEnt)initialClassifier);
Optimizer maximizer = new LimitedMemoryBFGS(mt);
// maximizer.optimize (); // XXX given the loop below, this seems wrong.
boolean converged;
for (int i = 0; i < numIterations; i++) {
try {
converged = maximizer.optimize (1);
} catch (IllegalArgumentException e) {
e.printStackTrace();
logger.info ("Catching exception; saying converged.");
converged = true;
}
if (converged)
break;
}
if (numIterations == Integer.MAX_VALUE) {
// Run it again because in our and Sam Roweis' experience, BFGS can still
// eke out more likelihood after first convergence by re-running without
// being restricted by its gradient history.
optimizer = new ConjugateGradient(mt);
try {
optimizer.optimize ();
} catch (IllegalArgumentException e) {
e.printStackTrace();
logger.info ("Catching exception; saying converged.");
}
}
progressLogger.info("\n"); // progress messages are on one line; move on.
return mt.getClassifier ();
}
// XXX This does not yet work here; it must be fixed before being re-enabled.
/**
* <p>Like <code>MaxEntTrainer.trainWithFeatureInduction</code>, but
* allows some default options to be changed.</p>
*
* @param maxent An initial partially-trained classifier (default <code>null</code>).
* This classifier may be modified during training.
* @param gainName The estimate of gain (log-likelihood increase) we want our chosen
* features to maximize.
* Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
* <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
* <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
*
* @return The trained <code>MaxEnt</code> classifier
*/
/*
public Classifier trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData,
InstanceList testingData,
ClassifierEvaluating evaluator,
MaxEnt maxent,
int totalIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
String gainName) {
// XXX This ought to be a parameter, except that setting it to true can
// crash training ("Jump too small").
boolean saveParametersDuringFI = false;
Alphabet inputAlphabet = trainingData.getDataAlphabet();
Alphabet outputAlphabet = trainingData.getTargetAlphabet();
if (maxent == null)
maxent = new RankMaxEnt(trainingData.getPipe(),
new double[(1+inputAlphabet.size()) * outputAlphabet.size()]);
int trainingIteration = 0;
int numLabels = outputAlphabet.size();
// Initialize feature selection
FeatureSelection globalFS = trainingData.getFeatureSelection();
if (globalFS == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
globalFS = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (globalFS);
}
if (validationData != null) validationData.setFeatureSelection (globalFS);
if (testingData != null) testingData.setFeatureSelection (globalFS);
maxent = new RankMaxEnt(maxent.getInstancePipe(), maxent.getParameters(), globalFS);
// Run feature induction
for (int featureInductionIteration = 0;
featureInductionIteration < numFeatureInductions;
featureInductionIteration++) {
// Print out some feature information
logger.info ("Feature induction iteration "+featureInductionIteration);
// Train the model a little bit. We don't care whether it converges; we
// execute all feature induction iterations no matter what.
if (featureInductionIteration != 0) {
// Don't train until we have added some features
setNumIterations(numIterationsBetweenFeatureInductions);
maxent = (RankMaxEnt)this.train (trainingData, validationData, testingData, evaluator,
maxent);
}
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
" features over "+numLabels+" labels.");
// Create the list of error tokens
// InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
// trainingData.getTargetAlphabet());
InstanceList errorInstances = new InstanceList (inputAlphabet, outputAlphabet);
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (globalFS);
List errorLabelVectors = new ArrayList(); // these are length-1 vectors
for (int i = 0; i < trainingData.size(); i++) {
Instance inst = trainingData.get(i);
// Having trained using just the current features, see how we classify
// the training data now.
Classification classification = maxent.classify(inst);
if (!classification.bestLabelIsCorrect()) {
InstanceList il = (InstanceList) inst.getData();
Instance subInstance =
il.get(((Integer)inst.getLabeling().getBestLabel().getEntry()).intValue());
errorInstances.add(subInstance);
errorLabelVectors.add(classification.getLabelVector());
// errorLabelVectors.add(createLabelVector(subInstance, classification));
}
}
logger.info ("Error instance list size = "+errorInstances.size());
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++) {
lvs[i] = (LabelVector)errorLabelVectors.get(i);
}
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals (EXP_GAIN))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals(GRADIENT_GAIN))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals(INFORMATION_GAIN))
gainFactory = new InfoGain.Factory ();
else
throw new IllegalArgumentException("Unsupported gain name: "+gainName);
FeatureInducer klfi =
new FeatureInducer (gainFactory,
errorInstances,
numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction);
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
logger.info ("MaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
klfi = null;
double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];
// XXX (Executing this block often causes an error during training; I don't know why.)
if (saveParametersDuringFI) {
// Keep current parameter values
// XXX This relies on the implementation detail that the most recent features
// added to an Alphabet get the highest indices.
// Count parameters per output label
int oldParamCount = maxent.parameters.length / outputAlphabet.size();
int newParamCount = 1+inputAlphabet.size();
// Copy params into the proper locations
for (int i=0; i<outputAlphabet.size(); i++) {
System.arraycopy(maxent.parameters, i*oldParamCount,
newParameters, i*newParamCount,
oldParamCount);
}
for (int i=0; i<oldParamCount; i++)
if (maxent.parameters[i] != newParameters[i]) {
System.out.println(maxent.parameters[i]+" "+newParameters[i]);
System.exit(0);
}
}
maxent.parameters = newParameters;
maxent.defaultFeatureIndex = inputAlphabet.size();
}
// Finished feature induction
logger.info("Ended with "+globalFS.cardinality()+" features.");
setNumIterations(totalIterations - trainingIteration);
return this.train (trainingData, validationData, testingData,
evaluator, maxent);
}
*/
public String toString()
{
return "RankMaxEntTrainer"
// + "("+maximizerClass.getName()+") "
+ ",numIterations=" + numIterations
+ ",gaussianPriorVariance="+gaussianPriorVariance;
}
// A private inner class that wraps up a RankMaxEnt classifier and its
// training data, exposing them as an Optimizable.ByGradientValue
// function that the optimizers above can maximize.
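// A minimal sketch of the calling pattern (this is what train() above
// does): an Optimizer repeatedly calls getValue() and
// getValueGradient(buffer) and proposes new parameters through
// setParameters(buff) until convergence.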
private class MaximizableTrainer implements Optimizable.ByGradientValue
{
double[] parameters, constraints, cachedGradient;
RankMaxEnt theClassifier;
InstanceList trainingList;
// The expectations are (temporarily) stored in the cachedGradient
double cachedValue;
boolean cachedValueStale;
boolean cachedGradientStale;
int numLabels;
int numFeatures;
int defaultFeatureIndex; // just for clarity
FeatureSelection featureSelection;
FeatureSelection[] perLabelFeatureSelection;
public MaximizableTrainer (){}
public MaximizableTrainer (InstanceList ilist, RankMaxEnt initialClassifier)
{
this.trainingList = ilist;
Alphabet fd = ilist.getDataAlphabet();
LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet();
// Don't fd.stopGrowth, because someone might want to do feature induction
//ld.stopGrowth();
// Add one feature for the "default feature" (a per-label bias that
// always fires with value 1.0).
// RankMaxEnt scores each FeatureVector in the sequence as "best" vs.
// "not best", so the label alphabet here is effectively binary.
this.numLabels = 2;
this.numFeatures = fd.size() + 1;
this.defaultFeatureIndex = numFeatures-1;
this.parameters = new double [numLabels * numFeatures];
this.constraints = new double [numLabels * numFeatures];
this.cachedGradient = new double [numLabels * numFeatures];
Arrays.fill (parameters, 0.0);
Arrays.fill (constraints, 0.0);
Arrays.fill (cachedGradient, 0.0);
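// Layout note: parameters, constraints and cachedGradient are row-major
// arrays indexed as [label*numFeatures + feature]; the final column,
// defaultFeatureIndex, holds the always-on "default feature" (bias).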
this.featureSelection = ilist.getFeatureSelection();
this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection();
// Add the default feature index to the selection
if (featureSelection != null)
featureSelection.add (defaultFeatureIndex);
if (perLabelFeatureSelection != null)
for (int i = 0; i < perLabelFeatureSelection.length; i++)
perLabelFeatureSelection[i].add (defaultFeatureIndex);
// xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
assert (featureSelection == null || perLabelFeatureSelection == null);
if (initialClassifier != null) {
this.theClassifier = initialClassifier;
this.parameters = theClassifier.parameters;
this.featureSelection = theClassifier.featureSelection;
this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
assert (initialClassifier.getInstancePipe() == ilist.getPipe());
}
else if (this.theClassifier == null) {
this.theClassifier = new RankMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
}
cachedValueStale = true;
cachedGradientStale = true;
// Initialize the constraints, using only the constraints from
// the "positive" instance
Iterator<Instance> iter = trainingList.iterator ();
logger.fine("Number of instances in training list = " + trainingList.size());
while (iter.hasNext()) {
Instance instance = iter.next();
double instanceWeight = trainingList.getInstanceWeight(instance);
FeatureVectorSequence fvs = (FeatureVectorSequence) instance.getData();
// label of best instance in subList
Object target = instance.getTarget();
Label label = null;
if (target instanceof Labels)
label = ((Labels)target).get(0);
else label = (Label)target;
int positiveIndex =
Integer.parseInt(label.getBestLabel().getEntry().toString());
if (positiveIndex == -1) { // invalid instance
logger.warning("True label is -1. Skipping...");
continue;
}
FeatureVector fv = (FeatureVector)fvs.get(positiveIndex);
Alphabet fdict = fv.getAlphabet();
assert (fv.getAlphabet() == fd);
// xxx ensure dimensionality of constraints correct
MatrixOps.rowPlusEquals (constraints, numFeatures, 0, fv, instanceWeight);
// For the default feature, whose weight is 1.0
assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
//assert(!Double.isNaN(li)) : "bestIndex is NaN";
boolean hasNaN = false;
for(int i = 0; i < fv.numLocations(); i++) {
if(Double.isNaN(fv.valueAtLocation(i))) {
logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
hasNaN = true;
}
}
if(hasNaN)
logger.info("NaN in instance: " + instance.getName());
// The default feature fires with value 1.0 on the positive instance
constraints[0*numFeatures + defaultFeatureIndex] += 1.0 * instanceWeight;
}
//TestMaximizable.testValueAndGradientCurrentParameters (this);
}
public RankMaxEnt getClassifier () { return theClassifier; }
public double getParameter (int index) {
return parameters[index];
}
public void setParameter (int index, double v) {
cachedValueStale = true;
cachedGradientStale = true;
parameters[index] = v;
}
public int getNumParameters() {
return parameters.length;
}
public void getParameters (double[] buff) {
// buff must already be sized to getNumParameters(); reallocating a
// local array here would be invisible to the caller.
assert (buff != null && buff.length == parameters.length);
System.arraycopy (parameters, 0, buff, 0, parameters.length);
}
public void setParameters (double [] buff) {
assert (buff != null);
cachedValueStale = true;
cachedGradientStale = true;
if (buff.length != parameters.length)
parameters = new double[buff.length];
System.arraycopy (buff, 0, parameters, 0, buff.length);
}
// log probability of the training labels, which here means the
// probability of the positive example being labeled as such
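// Concretely, the returned value is
// sum_i instanceWeight_i * log P(positiveIndex_i | sequence_i)
// - sum_j parameters_j^2 / (2 * gaussianPriorVariance)
// where P is the classifier's normalized score over the FeatureVectors
// in each instance's FeatureVectorSequence.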
public double getValue ()
{
if (cachedValueStale) {
cachedValue = 0;
// We'll store the expectation values in "cachedGradient" for now
cachedGradientStale = true;
MatrixOps.setAll (cachedGradient, 0.0);
// Incorporate likelihood of data
double value = 0.0;
Iterator<Instance> iter = trainingList.iterator();
int ii=0;
while (iter.hasNext()) {
ii++;
Instance instance = iter.next();
FeatureVectorSequence fvs = (FeatureVectorSequence) instance.getData();
// scores stores Pr of subList[i] being positive instance
double[] scores = new double[fvs.size()];
double instanceWeight = trainingList.getInstanceWeight(instance);
// The target is a String representation of an int indicating which
// FeatureVector in the sequence is the positive example. If the target
// is a single Label, proceed as usual. If it is a Labels (a tie), do
// not penalize the scores of the tied entries; this improved accuracy
// in some experiments.
Object target = instance.getTarget();
int li = -1;
if (target instanceof Label) {
li = Integer.parseInt(((Label)target).toString());
if (li == -1) // hack to avoid invalid instances
continue;
assert (li >=0 && li < fvs.size());
this.theClassifier.getClassificationScores (instance, scores);
} else if (target instanceof Labels){
Labels labels = (Labels)target;
int[] bestPositions = new int[labels.size()];
for (int pi = 0; pi < labels.size(); pi++)
bestPositions[pi] = Integer.parseInt(labels.get(pi).toString());
li = bestPositions[0];
this.theClassifier.getClassificationScoresForTies (instance, scores, bestPositions);
}
value = - (instanceWeight * Math.log (scores[li]));
if (Double.isNaN(value)) {
logger.fine ("RankMaxEntTrainer: Instance " + instance.getName() +
" has NaN value. log(scores) = " + Math.log(scores[li]) +
" scores = " + scores[li] +
" instance weight = " + instanceWeight);
}
if (Double.isInfinite(value)) {
logger.warning ("Instance "+instance.getSource() + " has infinite value; skipping value and gradient");
cachedValue -= value;
cachedValueStale = false;
return -value;
}
cachedValue += value;
for (int si=0; si < fvs.size(); si++) {
if (scores[si]==0)
continue;
assert (!Double.isInfinite(scores[si]));
FeatureVector cfv = (FeatureVector)fvs.get(si);
MatrixOps.rowPlusEquals (cachedGradient, numFeatures,
0, cfv, -instanceWeight * scores[si]);
cachedGradient[numFeatures*0 + defaultFeatureIndex] += (-instanceWeight * scores[si]);
}
}
// Incorporate prior on parameters
for (int li = 0; li < numLabels; li++)
for (int fi = 0; fi < numFeatures; fi++) {
double param = parameters[li*numFeatures + fi];
cachedValue += param * param / (2 * gaussianPriorVariance);
}
cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
cachedValueStale = false;
progressLogger.info ("Value (loglikelihood) = "+cachedValue);
}
return cachedValue;
}
public void getValueGradient (double [] buffer)
{
// Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
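// i.e., for each (label, feature) cell:
// gradient[l*numFeatures + f] = constraints[l*numFeatures + f]
// - expectation[l*numFeatures + f]
// - parameters[l*numFeatures + f] / gaussianPriorVariance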
if (cachedGradientStale) {
if (cachedValueStale)
// This will fill in the cachedGradient with the "-expectation"
getValue ();
MatrixOps.plusEquals (cachedGradient, constraints);
// Incorporate prior on parameters
MatrixOps.plusEquals (cachedGradient, parameters, -1.0 / gaussianPriorVariance);
// A parameter may be set to -infinity by an external user.
// We set gradient to 0 because the parameter's value can
// never change anyway and it will mess up future calculations
// on the matrix, such as norm().
MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
// Set to zero all the gradient dimensions that are not among the selected features
if (perLabelFeatureSelection == null) {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0, featureSelection, false);
} else {
for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
MatrixOps.rowSetAll (cachedGradient, numFeatures,
labelIndex, 0.0,
perLabelFeatureSelection[labelIndex], false);
}
cachedGradientStale = false;
}
assert (buffer != null && buffer.length == parameters.length);
System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
}
}
// SERIALIZATION
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 1;
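// writeObject appends CURRENT_SERIAL_VERSION after the default fields so
// that readObject can distinguish serialization formats if this class
// changes in the future.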
private void writeObject (ObjectOutputStream out) throws IOException {
out.defaultWriteObject ();
out.writeInt (CURRENT_SERIAL_VERSION);
}
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
in.defaultReadObject ();
int version = in.readInt ();
}
}