package cc.mallet.fst;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Random;
import java.util.logging.Logger;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizer;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.LabelVector;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Sequence;
import cc.mallet.util.MalletLogger;
/**
* Unlike ClassifierTrainer, TransducerTrainer is not "stateless" between calls
* to train: a TransducerTrainer is constructed paired with a specific
* Transducer, and can only train that Transducer. The CRF itself stores, and
* has methods for, FeatureSelection and weight freezing; this trainer stores,
* and has methods for, the contents, dimensions, sparsity, and feature
* induction of the CRF's weights as determined by the training data.
* <p>
* <b>Note:</b> In the future this class may go away in favor of some default
* version of CRFTrainerByValueGradients.
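* <p>
* A minimal usage sketch, assuming {@code trainingData} is an InstanceList
* whose instances pair a feature-vector sequence with a LabelSequence
* (variable names and numeric values here are illustrative only):
* <pre>{@code
* CRF crf = new CRF(trainingData.getPipe(), null);
* crf.addStatesForLabelsConnectedAsIn(trainingData);
* CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(crf);
* trainer.setGaussianPriorVariance(10.0); // illustrative value
* trainer.train(trainingData, 500);       // at most 500 optimizer iterations
* }</pre>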
*/
public class CRFTrainerByLabelLikelihood extends TransducerTrainer implements TransducerTrainer.ByOptimization {
private static Logger logger = MalletLogger.getLogger(CRFTrainerByLabelLikelihood.class.getName());
static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1.0;
static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
CRF crf;
//OptimizableCRF ocrf;
CRFOptimizableByLabelLikelihood ocrf;
Optimizer opt;
int iterationCount = 0;
boolean converged;
boolean usingHyperbolicPrior = false;
double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
boolean useSparseWeights = true;
boolean useNoWeights = false; // TODO remove this; it is just for debugging
private transient boolean useSomeUnsupportedTrick = true;
// Various values from CRF acting as indicators of when we need to ...
private int cachedValueWeightsStamp = -1; // ... re-calculate expectations and values to getValue() because weights' values changed
private int cachedGradientWeightsStamp = -1; // ... re-calculate to getValueGradient() because weights' values changed
private int cachedWeightsStructureStamp = -1; // ... re-allocate crf.weights, expectations & constraints because new states, transitions
// Compare ocrf.trainingSet to see when we need to re-allocate crf.weights, expectations & constraints because we are training on a different InstanceList than last time
// xxx temporary hack. This is quite useful to have, though!! -cas
public boolean printGradient = false;
public CRFTrainerByLabelLikelihood (CRF crf) {
this.crf = crf;
}
public Transducer getTransducer() { return crf; }
public CRF getCRF () { return crf; }
public Optimizer getOptimizer() { return opt; }
public boolean isConverged() { return converged; }
public boolean isFinishedTraining() { return converged; }
public int getIteration () { return iterationCount; }
/**
* Use this method to specify whether or not factors
* are added to the CRF by this trainer. If you have
* already set up the factors in your CRF, you may
* not want the trainer to add additional factors.
*
* @param flag If true, this trainer adds no factors to the CRF.
*/
public void setAddNoFactors(boolean flag) {
this.useNoWeights = flag;
}
public CRFOptimizableByLabelLikelihood getOptimizableCRF (InstanceList trainingSet) {
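// Rebuild the CRF's weight structure (and invalidate the cached optimizable)
// only when the CRF reports a structural change since we last looked;
// otherwise keep reusing the cached ocrf, so that expectations and
// constraints are not needlessly reallocated.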
if (cachedWeightsStructureStamp != crf.weightsStructureChangeStamp) {
if (!useNoWeights) {
if (useSparseWeights)
crf.setWeightsDimensionAsIn (trainingSet, useSomeUnsupportedTrick);
else
crf.setWeightsDimensionDensely ();
}
//reallocateSufficientStatistics(); // Not necessary here because it is done in the constructor for OptimizableCRF
ocrf = null;
cachedWeightsStructureStamp = crf.weightsStructureChangeStamp;
}
if (ocrf == null || ocrf.trainingSet != trainingSet) {
//ocrf = new OptimizableCRF (crf, trainingSet);
ocrf = new CRFOptimizableByLabelLikelihood(crf, trainingSet);
ocrf.setGaussianPriorVariance(gaussianPriorVariance);
ocrf.setHyperbolicPriorSharpness(hyperbolicPriorSharpness);
ocrf.setHyperbolicPriorSlope(hyperbolicPriorSlope);
ocrf.setUseHyperbolicPrior(usingHyperbolicPrior);
opt = null;
}
return ocrf;
}
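/**
 * Returns, lazily constructing and caching, the Optimizer (currently
 * limited-memory BFGS) for this trainer's optimizable CRF. A sketch of
 * driving the optimizer by hand instead of calling
 * {@link #train(InstanceList,int)} (illustrative only):
 * <pre>{@code
 * Optimizer opt = trainer.getOptimizer(trainingData);
 * boolean converged = opt.optimize(10); // run up to 10 optimizer iterations
 * }</pre>
 */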
public Optimizer getOptimizer (InstanceList trainingSet) {
getOptimizableCRF(trainingSet); // this will set this.ocrf if necessary
if (opt == null || ocrf != opt.getOptimizable())
opt = new LimitedMemoryBFGS(ocrf); // Alternative: opt = new ConjugateGradient (0.001);
return opt;
}
// Java question:
// If I make a non-static inner class CRF.Trainer,
// can that class be subclassed in another .java file,
// and can that subclass still have access to all the CRF's
// instance variables?
// ANSWER: Yes and yes, but you have to use special syntax in the subclass ctor (see mallet-dev archive) -cas
public boolean trainIncremental (InstanceList training)
{
return train (training, Integer.MAX_VALUE);
}
public boolean train (InstanceList trainingSet, int numIterations) {
if (numIterations <= 0)
return false;
assert (trainingSet.size() > 0);
getOptimizableCRF(trainingSet); // This will set this.ocrf if necessary
getOptimizer(trainingSet); // This will set this.opt if necessary
boolean converged = false;
logger.info ("CRF about to train with "+numIterations+" iterations");
for (int i = 0; i < numIterations; i++) {
try {
converged = opt.optimize (1);
iterationCount++;
logger.info ("CRF finished one iteration of maximizer, i="+i);
runEvaluators();
} catch (Exception e) {
// One catch handles IllegalArgumentException and everything else alike:
// report the problem and treat training as converged.
e.printStackTrace();
logger.info ("Catching exception; saying converged.");
converged = true;
}
if (converged) {
logger.info ("CRF training has converged, i="+i);
break;
}
}
return converged;
}
/**
* Train a CRF on various-sized subsets of the data. This method is typically used to accelerate training by
* quickly reaching reasonable parameters on only a subset of the data first, then training on progressively more data.
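* <p>
* For example, to run at most 20 optimizer iterations on a 20% sample, then a
* 50% sample, then the full training set (illustrative values, not a tuned default):
* <pre>{@code
* trainer.train(training, 20, new double[] {0.2, 0.5, 1.0});
* }</pre>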
* @param training The training Instances.
* @param numIterationsPerProportion Maximum number of Maximizer iterations per training proportion.
* @param trainingProportions If non-null, train on increasingly
* larger portions of the data, e.g. new double[] {0.2, 0.5, 1.0}. This can sometimes speed up convergence.
* Be sure to end in 1.0 if you want to train on all the data in the end.
* @return True if training has converged.
*/
public boolean train (InstanceList training, int numIterationsPerProportion, double[] trainingProportions)
{
int trainingIteration = 0;
assert (trainingProportions.length > 0);
boolean converged = false;
for (int i = 0; i < trainingProportions.length; i++) {
assert (trainingProportions[i] <= 1.0);
logger.info ("Training on "+trainingProportions[i]+"% of the data this round.");
if (trainingProportions[i] == 1.0)
converged = this.train (training, numIterationsPerProportion);
else
converged = this.train (training.split (new Random(1),
new double[] {trainingProportions[i], 1-trainingProportions[i]})[0], numIterationsPerProportion);
trainingIteration += numIterationsPerProportion;
}
return converged;
}
public boolean trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData, InstanceList testingData,
TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold,
boolean clusteredFeatureInduction,
double[] trainingProportions)
{
return trainWithFeatureInduction (trainingData, validationData, testingData,
eval, numIterations, numIterationsBetweenFeatureInductions,
numFeatureInductions, numFeaturesPerFeatureInduction,
trueLabelProbThreshold, clusteredFeatureInduction,
trainingProportions, "exp");
}
/**
* Train a CRF using feature induction to generate conjunctions of
* features. Feature induction is run periodically during
* training. The features are added to improve performance on the
* mislabeled instances, with the specific scoring criterion given
* by the {@link FeatureInducer} specified by <code>gainName</code>.
*
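* <p>
* A sketch of a typical invocation; {@code evaluator} is a hypothetical
* TransducerEvaluator and all numeric arguments are illustrative only:
* <pre>{@code
* trainer.trainWithFeatureInduction(trainingData, null, null,
*     evaluator, 500, // at most 500 optimizer iterations overall
*     10,             // re-induce features every 10 iterations
*     20,             // at most 20 rounds of induction
*     500,            // at most 500 features induced per round
*     0.9,            // harvest tokens with true-label probability below 0.9
*     false,          // one global FeatureInducer, not per label pair
*     null,           // no training-proportion schedule
*     "exp");         // score candidate features with ExpGain
* }</pre>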
* @param training The training Instances.
* @param validation The validation Instances.
* @param testing The testing instances.
* @param eval For evaluation during training.
* @param numIterations Maximum number of Maximizer iterations.
* @param numIterationsBetweenFeatureInductions Number of maximizer
* iterations between each call to the Feature Inducer.
* @param numFeatureInductions Maximum number of rounds of feature
* induction.
* @param numFeaturesPerFeatureInduction Maximum number of features
* to induce at each round of induction.
* @param trueLabelProbThreshold If the model's probability of the
* true Label of an Instance is less than this value, it is added as
* an error instance to the {@link FeatureInducer}.
* @param clusteredFeatureInduction If true, a separate {@link
* FeatureInducer} is constructed for each label pair. This can
* avoid inducing a disproportionate number of features for a single
* label.
* @param trainingProportions If non-null, train on increasingly
* larger portions of the data (e.g. [0.2, 0.5, 1.0]). This can
* sometimes speed up convergence.
* @param gainName The type of {@link FeatureInducer} to use. One of
* "exp", "grad", or "info" for {@link ExpGain}, {@link
* GradientGain}, or {@link InfoGain}.
* @return True if training has converged.
*/
public boolean trainWithFeatureInduction (InstanceList trainingData,
InstanceList validationData, InstanceList testingData,
TransducerEvaluator eval, int numIterations,
int numIterationsBetweenFeatureInductions,
int numFeatureInductions,
int numFeaturesPerFeatureInduction,
double trueLabelProbThreshold,
boolean clusteredFeatureInduction,
double[] trainingProportions,
String gainName)
{
int trainingIteration = 0;
int numLabels = crf.outputAlphabet.size();
crf.globalFeatureSelection = trainingData.getFeatureSelection();
if (crf.globalFeatureSelection == null) {
// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
crf.globalFeatureSelection = new FeatureSelection (trainingData.getDataAlphabet());
trainingData.setFeatureSelection (crf.globalFeatureSelection);
}
// TODO Careful! If validationData and testingData get removed as arguments to this method
// then the next two lines of work will have to be done somewhere.
if (validationData != null) validationData.setFeatureSelection (crf.globalFeatureSelection);
if (testingData != null) testingData.setFeatureSelection (crf.globalFeatureSelection);
for (int featureInductionIteration = 0;
featureInductionIteration < numFeatureInductions;
featureInductionIteration++)
{
// Print out some feature information
logger.info ("Feature induction iteration "+featureInductionIteration);
// Train the CRF
InstanceList theTrainingData = trainingData;
if (trainingProportions != null && featureInductionIteration < trainingProportions.length) {
logger.info ("Training on "+trainingProportions[featureInductionIteration]+"% of the data this round.");
InstanceList[] sampledTrainingData = trainingData.split (new Random(1),
new double[] {trainingProportions[featureInductionIteration],
1-trainingProportions[featureInductionIteration]});
theTrainingData = sampledTrainingData[0];
theTrainingData.setFeatureSelection (crf.globalFeatureSelection); // xxx necessary?
logger.info (" which is "+theTrainingData.size()+" instances");
}
boolean converged = false;
if (featureInductionIteration != 0)
// Don't train until we have added some features
converged = this.train (theTrainingData, numIterationsBetweenFeatureInductions);
trainingIteration += numIterationsBetweenFeatureInductions;
logger.info ("Starting feature induction with "+crf.inputAlphabet.size()+" features.");
// Create the list of error tokens, for both unclustered and clustered feature induction
InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
// This errorInstances.featureSelection will get examined by FeatureInducer,
// so it can know how to add "new" singleton features
errorInstances.setFeatureSelection (crf.globalFeatureSelection);
ArrayList errorLabelVectors = new ArrayList();
InstanceList clusteredErrorInstances[][] = new InstanceList[numLabels][numLabels];
ArrayList clusteredErrorLabelVectors[][] = new ArrayList[numLabels][numLabels];
for (int i = 0; i < numLabels; i++)
for (int j = 0; j < numLabels; j++) {
clusteredErrorInstances[i][j] = new InstanceList (trainingData.getDataAlphabet(),
trainingData.getTargetAlphabet());
clusteredErrorInstances[i][j].setFeatureSelection (crf.globalFeatureSelection);
clusteredErrorLabelVectors[i][j] = new ArrayList();
}
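// Harvest "error" tokens: any position whose posterior probability of the
// true label (a lattice gamma, not a Viterbi decision) falls below
// trueLabelProbThreshold is added as an error instance, bucketed by
// (previous predicted label, predicted label) for the clustered case.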
for (int i = 0; i < theTrainingData.size(); i++) {
logger.info ("instance="+i);
Instance instance = theTrainingData.get(i);
Sequence input = (Sequence) instance.getData();
Sequence trueOutput = (Sequence) instance.getTarget();
assert (input.size() == trueOutput.size());
SumLattice lattice =
crf.sumLatticeFactory.newSumLattice (crf, input, (Sequence)null, (Transducer.Incrementor)null,
(LabelAlphabet)theTrainingData.getTargetAlphabet());
int prevLabelIndex = 0; // This will put extra error instances in this cluster
for (int j = 0; j < trueOutput.size(); j++) {
Label label = (Label) ((LabelSequence)trueOutput).getLabelAtPosition(j);
assert (label != null);
//System.out.println ("Instance="+i+" position="+j+" fv="+lattice.getLabelingAtPosition(j).toString(true));
LabelVector latticeLabeling = lattice.getLabelingAtPosition(j);
double trueLabelProb = latticeLabeling.value(label.getIndex());
int labelIndex = latticeLabeling.getBestIndex();
//System.out.println ("position="+j+" trueLabelProb="+trueLabelProb);
if (trueLabelProb < trueLabelProbThreshold) {
logger.info ("Adding error: instance="+i+" position="+j+" prtrue="+trueLabelProb+
(label == latticeLabeling.getBestLabel() ? " " : " *")+
" truelabel="+label+
" predlabel="+latticeLabeling.getBestLabel()+
" fv="+((FeatureVector)input.get(j)).toString(true));
errorInstances.add (input.get(j), label, null, null);
errorLabelVectors.add (latticeLabeling);
clusteredErrorInstances[prevLabelIndex][labelIndex].add (input.get(j), label, null, null);
clusteredErrorLabelVectors[prevLabelIndex][labelIndex].add (latticeLabeling);
}
prevLabelIndex = labelIndex;
}
}
logger.info ("Error instance list size = "+errorInstances.size());
if (clusteredFeatureInduction) {
FeatureInducer[][] klfi = new FeatureInducer[numLabels][numLabels];
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
// Note that we may see some "impossible" transitions here (like O->I in an IOB model)
// because we are using lattice gammas to get the predicted label, not Viterbi.
// I don't believe this does any harm, and may do some good.
logger.info ("Doing feature induction for "+
crf.outputAlphabet.lookupObject(i)+" -> "+crf.outputAlphabet.lookupObject(j)+
" with "+clusteredErrorInstances[i][j].size()+" instances");
if (clusteredErrorInstances[i][j].size() < 20) {
logger.info ("..skipping because only "+clusteredErrorInstances[i][j].size()+" instances.");
continue;
}
int s = clusteredErrorLabelVectors[i][j].size();
LabelVector[] lvs = new LabelVector[s];
for (int k = 0; k < s; k++)
lvs[k] = (LabelVector) clusteredErrorLabelVectors[i][j].get(k);
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals ("exp"))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals("grad"))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals("info"))
gainFactory = new InfoGain.Factory ();
klfi[i][j] = new FeatureInducer (gainFactory,
clusteredErrorInstances[i][j],
numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction);
crf.featureInducers.add(klfi[i][j]);
}
}
for (int i = 0; i < numLabels; i++) {
for (int j = 0; j < numLabels; j++) {
logger.info ("Adding new induced features for "+
crf.outputAlphabet.lookupObject(i)+" -> "+crf.outputAlphabet.lookupObject(j));
if (klfi[i][j] == null) {
logger.info ("...skipping because no features induced.");
continue;
}
// Note that this adds features globally, but not on a per-transition basis
klfi[i][j].induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi[i][j].induceFeaturesFor (testingData, false, false);
}
}
klfi = null;
} else {
int s = errorLabelVectors.size();
LabelVector[] lvs = new LabelVector[s];
for (int i = 0; i < s; i++)
lvs[i] = (LabelVector) errorLabelVectors.get(i);
RankedFeatureVector.Factory gainFactory = null;
if (gainName.equals ("exp"))
gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
else if (gainName.equals("grad"))
gainFactory = new GradientGain.Factory (lvs);
else if (gainName.equals("info"))
gainFactory = new InfoGain.Factory ();
FeatureInducer klfi =
new FeatureInducer (gainFactory,
errorInstances,
numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction,
2*numFeaturesPerFeatureInduction);
crf.featureInducers.add(klfi);
// Note that this adds features globally, but not on a per-transition basis
klfi.induceFeaturesFor (trainingData, false, false);
if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
logger.info ("CRF4 FeatureSelection now includes "+crf.globalFeatureSelection.cardinality()+" features");
klfi = null;
}
// This is done in this.train() anyway (via getOptimizableCRF)
//this.setWeightsDimensionAsIn (trainingData);
////this.growWeightsDimensionToInputAlphabet ();
}
return this.train (trainingData, numIterations - trainingIteration);
}
public void setUseHyperbolicPrior (boolean f) { usingHyperbolicPrior = f; }
public void setHyperbolicPriorSlope (double p) { hyperbolicPriorSlope = p; }
public void setHyperbolicPriorSharpness (double p) { hyperbolicPriorSharpness = p; }
public double getUseHyperbolicPriorSlope () { return hyperbolicPriorSlope; }
public double getUseHyperbolicPriorSharpness () { return hyperbolicPriorSharpness; }
public void setGaussianPriorVariance (double p) { gaussianPriorVariance = p; }
public double getGaussianPriorVariance () { return gaussianPriorVariance; }
//public int getDefaultFeatureIndex () { return defaultFeatureIndex;}
public void setUseSparseWeights (boolean b) { useSparseWeights = b; }
public boolean getUseSparseWeights () { return useSparseWeights; }
/** Sets whether to use the "some unsupported" trick: when training a CRF
 * that has already had some training and uses sparse weights, add a few weights
 * for features that do not occur in the training data.
* <p>
* This generally leads to better accuracy at only a small memory cost.
*
* @param b Whether to use the trick
*/
public void setUseSomeUnsupportedTrick (boolean b) { useSomeUnsupportedTrick = b; }
// Serialization for CRFTrainerByLabelLikelihood
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 1;
static final int NULL_INTEGER = -1;
/* Need to check for null pointers. */
private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
//out.writeInt(defaultFeatureIndex);
out.writeBoolean(usingHyperbolicPrior);
out.writeDouble(gaussianPriorVariance);
out.writeDouble(hyperbolicPriorSlope);
out.writeDouble(hyperbolicPriorSharpness);
out.writeInt(cachedGradientWeightsStamp);
out.writeInt(cachedValueWeightsStamp);
out.writeInt(cachedWeightsStructureStamp);
out.writeBoolean(printGradient);
out.writeBoolean (useSparseWeights);
throw new IllegalStateException("Implementation not yet complete.");
}
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int version = in.readInt ();
//defaultFeatureIndex = in.readInt();
usingHyperbolicPrior = in.readBoolean();
gaussianPriorVariance = in.readDouble();
hyperbolicPriorSlope = in.readDouble();
hyperbolicPriorSharpness = in.readDouble();
// Read the cached stamps back in the same order writeObject writes them.
cachedGradientWeightsStamp = in.readInt();
cachedValueWeightsStamp = in.readInt();
cachedWeightsStructureStamp = in.readInt();
printGradient = in.readBoolean();
useSparseWeights = in.readBoolean();
throw new IllegalStateException("Implementation not yet complete.");
}
}