/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org.  For further
information, see the file `LICENSE' included with this distribution. */





package cc.mallet.classify;


import java.util.logging.*;
import java.util.*;
import java.io.*;

import cc.mallet.classify.Classifier;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;

// Does not currently handle instances that are labeled with distributions
// instead of a single label.
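//
// Typical usage, as a rough sketch (the InstanceList "training" and the Instance
// "doc" below are hypothetical; any MALLET pipe that produces FeatureVector data
// and Label targets would do):
//
//   MCMaxEntTrainer trainer = new MCMaxEntTrainer (1.0);   // Gaussian prior variance
//   MCMaxEnt classifier = trainer.train (training);
//   Classification c = classifier.classify (doc);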
/**
 * The trainer for a Maximum Entropy classifier.
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

public class MCMaxEntTrainer extends ClassifierTrainer<MCMaxEnt> implements Boostable, Serializable //implements CommandOption.ListProviding
{
  private static Logger logger = MalletLogger.getLogger(MCMaxEntTrainer.class.getName());
  private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MCMaxEntTrainer.class.getName()+"-pl");

  int numGetValueCalls = 0;
  int numGetValueGradientCalls = 0;
  int numIterations = 10;

  public static final String EXP_GAIN = "exp";
  public static final String GRADIENT_GAIN = "grad";
  public static final String INFORMATION_GAIN = "info";

  // xxx Why does TestMaximizable fail when this variance is very small?
  static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = .1;  // note used to be 1
  static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
  static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
  static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;

  // CPAL
  boolean usingMultiConditionalTraining = true;
  boolean usingHyperbolicPrior = false;
  double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
  double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
  double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
  Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;
  double generativeWeighting = 1.0;
  MaximizableTrainer mt;
  MCMaxEnt initialClassifier;

  // CPAL
  static CommandOption.Boolean usingMultiConditionalTrainingOption =
      new CommandOption.Boolean (MCMaxEntTrainer.class, "useMCTraining", "true|false", true, true,
                                 "Use MultiConditional Training", null);
  static CommandOption.Boolean usingHyperbolicPriorOption =
      new CommandOption.Boolean (MCMaxEntTrainer.class, "useHyperbolicPrior", "true|false", false, false,
                                 "Use hyperbolic (close to L1 penalty) prior over parameters", null);
  static CommandOption.Double gaussianPriorVarianceOption =
      new CommandOption.Double (MCMaxEntTrainer.class, "gaussianPriorVariance", "FLOAT", true, 10.0,
                                "Variance of the gaussian prior over parameters", null);
  static CommandOption.Double hyperbolicPriorSlopeOption =
      new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSlope", "FLOAT", true, 0.2,
                                "Slope of the (L1 penalty) hyperbolic prior over parameters", null);
  static CommandOption.Double hyperbolicPriorSharpnessOption =
      new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSharpness", "FLOAT", true, 10.0,
                                "Sharpness of the (L1 penalty) hyperbolic prior over parameters", null);

  static final CommandOption.List commandOptions =
      new CommandOption.List (
          "MCMaximum Entropy Classifier",
          new CommandOption[] {
            usingHyperbolicPriorOption,
            gaussianPriorVarianceOption,
            hyperbolicPriorSlopeOption,
            hyperbolicPriorSharpnessOption,
            usingMultiConditionalTrainingOption,   // CPAL
          });

  public static CommandOption.List getCommandOptionList ()
  {
    return commandOptions;
  }

  /*
  public MCMaxEntTrainer(Maximizer.ByGradient maximizer)
  {
  this.maximizerByGradient = maximizer;
  this.usingHyperbolicPrior = false;
  }
  */

  public MCMaxEntTrainer (CommandOption.List col)
  {
    this.usingHyperbolicPrior = usingHyperbolicPriorOption.value;
    this.gaussianPriorVariance = gaussianPriorVarianceOption.value;
    this.hyperbolicPriorSlope = hyperbolicPriorSlopeOption.value;
    this.hyperbolicPriorSharpness = hyperbolicPriorSharpnessOption.value;
    this.usingMultiConditionalTraining = usingMultiConditionalTrainingOption.value;
  }
 
  public MCMaxEntTrainer (MCMaxEnt initialClassifier) {
    this.initialClassifier = initialClassifier;
  }

  public MCMaxEntTrainer ()
  {
    this (false);
  }

  public MCMaxEntTrainer (boolean useHyperbolicPrior)
  {
    this.usingHyperbolicPrior = useHyperbolicPrior;
  }

  /** Constructs a trainer with a parameter to avoid overtraining.  1.0 is
   * usually a reasonable default value. */
  public MCMaxEntTrainer (double gaussianPriorVariance)
  {
    this.usingHyperbolicPrior = false;
    this.gaussianPriorVariance = gaussianPriorVariance;
  }

  // CPAL - added this to do MultiConditionalTraining
  public MCMaxEntTrainer (double gaussianPriorVariance, boolean useMultiConditionalTraining )
  {
    this.usingHyperbolicPrior = false;
    this.usingMultiConditionalTraining = useMultiConditionalTraining;
    this.gaussianPriorVariance = gaussianPriorVariance;
  }

  public MCMaxEntTrainer (double hyperbolicPriorSlope,
                          double hyperbolicPriorSharpness)
  {
    this.usingHyperbolicPrior = true;
    this.hyperbolicPriorSlope = hyperbolicPriorSlope;
    this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
  }

  public Optimizable.ByGradientValue getMaximizableTrainer (InstanceList ilist)
  {
    if (ilist == null)
      return new MaximizableTrainer ();
    return new MaximizableTrainer (ilist, null);
  }

  /**
   * Specifies the maximum number of iterations to run during a single call
   * to <code>train</code> or <code>trainWithFeatureInduction</code>.  Not
   * currently functional.
   * @return This trainer
   */
  // XXX Since we maximize before using numIterations, this doesn't work.
  // Is that a bug?  If so, should the default numIterations be higher?
  public MCMaxEntTrainer setNumIterations (int i)
  {
    numIterations = i;
    return this;
  }

  public MCMaxEntTrainer setUseHyperbolicPrior (boolean useHyperbolicPrior)
  {
    this.usingHyperbolicPrior = useHyperbolicPrior;
    return this;
  }

  /**
   * Sets a parameter to prevent overtraining.  A smaller variance for the prior
   * means that feature weights are expected to hover closer to 0, so extra
   * evidence is required to set a higher weight.
   * @return This trainer
   */
  public MCMaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance)
  {
    this.usingHyperbolicPrior = false;
    this.gaussianPriorVariance = gaussianPriorVariance;
    return this;
  }

  public MCMaxEntTrainer setHyperbolicPriorSlope(double hyperbolicPriorSlope)
  {
    this.usingHyperbolicPrior = true;
    this.hyperbolicPriorSlope = hyperbolicPriorSlope;

    return this;
  }

  public MCMaxEntTrainer setHyperbolicPriorSharpness (double hyperbolicPriorSharpness)
  {
    this.usingHyperbolicPrior = true;
    this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;

    return this;
  }
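
  // The setters above return this trainer, so configuration calls can be chained,
  // for example (a sketch using only methods defined in this class):
  //
  //   MCMaxEntTrainer trainer =
  //       new MCMaxEntTrainer ().setUseHyperbolicPrior (false).setGaussianPriorVariance (10.0);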
 
  public MCMaxEnt getClassifier () {
    return mt.getClassifier();
  }


  public MCMaxEnt train (InstanceList trainingSet)
  {
    logger.fine ("trainingSet.size() = "+trainingSet.size());
    mt = new MaximizableTrainer (trainingSet, (MCMaxEnt)initialClassifier);
    Optimizer maximizer = new LimitedMemoryBFGS(mt);
    // CPAL - change the tolerance for large vocab experiments
    ((LimitedMemoryBFGS)maximizer).setTolerance(.00001);    // std is .0001;
    maximizer.optimize (); // XXX given the loop below, this seems wrong.

    logger.info("MCMaxEnt ngetValueCalls:"+getValueCalls()+"\nMCMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
//    boolean converged;
//
//     for (int i = 0; i < numIterations; i++) {
//      converged = maximizer.maximize (mt, 1);
//      if (converged)
//         break;
//      else if (evaluator != null)
//         if (!evaluator.evaluate (mt.getClassifier(), converged, i, mt.getValue(),
//                                  trainingSet, validationSet, testSet))
//           break;
//    }
//    TestMaximizable.testValueAndGradient (mt);
    progressLogger.info("\n"); //  progess messages are on one line; move on.
    return mt.getClassifier ();
  }


  /**
   * <p>Like the other version of <code>trainWithFeatureInduction</code>, but
   * allows some default options to be changed.</p>
   *
   * @param maxent An initial partially-trained classifier (default <code>null</code>).
   * This classifier may be modified during training.
   * @param gainName The estimate of gain (log-likelihood increase) we want our chosen
   * features to maximize.
   * Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
   * <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
   * <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
   *
   * @return The trained <code>MaxEnt</code> classifier
   */
  /*
  public Classifier trainWithFeatureInduction (InstanceList trainingData,
                                               InstanceList validationData,
                                               InstanceList testingData,
                                               ClassifierEvaluating evaluator,
                                               MCMaxEnt maxent,

                                               int totalIterations,
                                               int numIterationsBetweenFeatureInductions,
                                               int numFeatureInductions,
                                               int numFeaturesPerFeatureInduction,
                                               String gainName) {

    // XXX This ought to be a parameter, except that setting it to true can
    // crash training ("Jump too small").
    boolean saveParametersDuringFI = false;

    Alphabet inputAlphabet = trainingData.getDataAlphabet();
    Alphabet outputAlphabet = trainingData.getTargetAlphabet();

    if (maxent == null)
      maxent = new MCMaxEnt(trainingData.getPipe(),
                            new double[(1+inputAlphabet.size()) * outputAlphabet.size()]);

    int trainingIteration = 0;
    int numLabels = outputAlphabet.size();

    // Initialize feature selection
    FeatureSelection globalFS = trainingData.getFeatureSelection();
    if (globalFS == null) {
      // Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
      globalFS = new FeatureSelection (trainingData.getDataAlphabet());
      trainingData.setFeatureSelection (globalFS);
    }
    if (validationData != null) validationData.setFeatureSelection (globalFS);
    if (testingData != null) testingData.setFeatureSelection (globalFS);
    maxent = new MCMaxEnt(maxent.getInstancePipe(), maxent.getParameters(), globalFS);

    // Run feature induction
    for (int featureInductionIteration = 0;
         featureInductionIteration < numFeatureInductions;
         featureInductionIteration++) {

      // Print out some feature information
      logger.info ("Feature induction iteration "+featureInductionIteration);

      // Train the model a little bit.  We don't care whether it converges; we
      // execute all feature induction iterations no matter what.
      if (featureInductionIteration != 0) {
        // Don't train until we have added some features
        setNumIterations(numIterationsBetweenFeatureInductions);
        maxent = (MCMaxEnt)this.train (trainingData, validationData, testingData, evaluator,
                                       maxent);
      }
      trainingIteration += numIterationsBetweenFeatureInductions;

      logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
                   " features over "+numLabels+" labels.");

      // Create the list of error tokens
      InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
                                                      trainingData.getTargetAlphabet());

      // This errorInstances.featureSelection will get examined by FeatureInducer,
      // so it can know how to add "new" singleton features
      errorInstances.setFeatureSelection (globalFS);
      List errorLabelVectors = new ArrayList();    // these are length-1 vectors
      for (int i = 0; i < trainingData.size(); i++) {
        Instance instance = trainingData.get(i);
        FeatureVector inputVector = (FeatureVector) instance.getData();
        Label trueLabel = (Label) instance.getTarget();

        // Having trained using just the current features, see how we classify
        // the training data now.
        Classification classification = maxent.classify(instance);
        if (!classification.bestLabelIsCorrect()) {
          errorInstances.add(inputVector, trueLabel, null, null);
          errorLabelVectors.add(classification.getLabelVector());
        }
      }
      logger.info ("Error instance list size = "+errorInstances.size());
      int s = errorLabelVectors.size();

      LabelVector[] lvs = new LabelVector[s];
      for (int i = 0; i < s; i++) {
        lvs[i] = (LabelVector)errorLabelVectors.get(i);
      }

      RankedFeatureVector.Factory gainFactory = null;
      if (gainName.equals (EXP_GAIN))
        gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
      else if (gainName.equals(GRADIENT_GAIN))
        gainFactory =  new GradientGain.Factory (lvs);
      else if (gainName.equals(INFORMATION_GAIN))
        gainFactory =  new InfoGain.Factory ();
      else
        throw new IllegalArgumentException("Unsupported gain name: "+gainName);

      FeatureInducer klfi =
          new FeatureInducer (gainFactory,
                              errorInstances,
                              numFeaturesPerFeatureInduction,
                              2*numFeaturesPerFeatureInduction,
                              2*numFeaturesPerFeatureInduction);

      // Note that this adds features globally, but not on a per-transition basis
      klfi.induceFeaturesFor (trainingData, false, false);
      if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
      logger.info ("MCMaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
      klfi = null;

      double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];

      // XXX (Executing this block often causes an error during training; I don't know why.)
      if (saveParametersDuringFI) {
        // Keep current parameter values
        // XXX This relies on the implementation detail that the most recent features
        // added to an Alphabet get the highest indices.

        // Count parameters per output label
        int oldParamCount = maxent.parameters.length / outputAlphabet.size();
        int newParamCount = 1+inputAlphabet.size();
        // Copy params into the proper locations
        for (int i=0; i<outputAlphabet.size(); i++) {
          System.arraycopy(maxent.parameters, i*oldParamCount,
                           newParameters, i*newParamCount,
                           oldParamCount);
        }
        for (int i=0; i<oldParamCount; i++)
          if (maxent.parameters[i] != newParameters[i]) {
            System.out.println(maxent.parameters[i]+" "+newParameters[i]);
            System.exit(0);
          }
      }

      maxent.parameters = newParameters;
      maxent.defaultFeatureIndex = inputAlphabet.size();
    }

    // Finished feature induction
    logger.info("Ended with "+globalFS.cardinality()+" features.");
    setNumIterations(totalIterations - trainingIteration);
    return this.train (trainingData, validationData, testingData,
                       evaluator, maxent);
  }
  */

  // XXX Should these really be public?  Why?
  /** Counts how many times this trainer has computed the gradient of the
   * log probability of training labels. */
  public int getValueGradientCalls() {return numGetValueGradientCalls;}
  /** Counts how many times this trainer has computed the
   * log probability of training labels. */
  public int getValueCalls() {return numGetValueCalls;}
//  public int getIterations() {return maximizerByGradient.getIterations();}

  public String toString()
  {
    return "MCMaxEntTrainer"
    //  + "("+maximizerClass.getName()+") "
           + ",numIterations=" + numIterations
           + (usingHyperbolicPrior
              ? (",hyperbolicPriorSlope="+hyperbolicPriorSlope+
                 ",hyperbolicPriorSharpness="+hyperbolicPriorSharpness)
              : (",gaussianPriorVariance="+gaussianPriorVariance));
  }



  // A private inner class that wraps up a MCMaxEnt classifier and its training data.
  // The result is an Optimizable.ByGradientValue function.
  private class MaximizableTrainer implements Optimizable.ByGradientValue
  {
    double[] parameters, constraints, cachedGradient;
    MCMaxEnt theClassifier;
    InstanceList trainingList;
    // The expectations are (temporarily) stored in the cachedGradient
    double cachedValue;
    boolean cachedValueStale;
    boolean cachedGradientStale;
    int numLabels;
    int numFeatures;
    int defaultFeatureIndex;            // just for clarity
    FeatureSelection featureSelection;
    FeatureSelection[] perLabelFeatureSelection;

    public MaximizableTrainer (){}

    public MaximizableTrainer (InstanceList ilist, MCMaxEnt initialClassifier)
    {
      this.trainingList = ilist;
      Alphabet fd = ilist.getDataAlphabet();
      LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet();
      // Don't fd.stopGrowth, because someone might want to do feature induction
      ld.stopGrowth();
      // Add one feature for the "default feature".
      this.numLabels = ld.size();
      this.numFeatures = fd.size() + 1;
      this.defaultFeatureIndex = numFeatures-1;
      this.parameters = new double [numLabels * numFeatures];
      this.constraints = new double [numLabels * numFeatures];
      this.cachedGradient = new double [numLabels * numFeatures];
      Arrays.fill (parameters, 0.0);
      Arrays.fill (constraints, 0.0);
      Arrays.fill (cachedGradient, 0.0);
      this.featureSelection = ilist.getFeatureSelection();
      this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection();
      // Add the default feature index to the selection
      if (featureSelection != null)
        featureSelection.add (defaultFeatureIndex);
      if (perLabelFeatureSelection != null)
        for (int i = 0; i < perLabelFeatureSelection.length; i++)
          perLabelFeatureSelection[i].add (defaultFeatureIndex);
      // xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
      assert (featureSelection == null || perLabelFeatureSelection == null);
      if (initialClassifier != null) {

        this.theClassifier = initialClassifier;
        this.parameters = theClassifier.parameters;
        this.featureSelection = theClassifier.featureSelection;
        this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
        this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
        assert (initialClassifier.getInstancePipe() == ilist.getPipe());
      }
      else if (this.theClassifier == null) {
        this.theClassifier = new MCMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
      }
      cachedValueStale = true;
      cachedGradientStale = true;

      // Initialize the constraints
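      // The "constraints" hold the empirical (instance-weighted) feature counts for
      // each label -- the data-side term of the gradient.  getValueGradient() later
      // adds them to the negated model expectations accumulated in cachedGradient.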
      logger.fine("Number of instances in training list = " + trainingList.size());
      for (Instance inst : trainingList) {
        double instanceWeight = trainingList.getInstanceWeight(inst);
        Labeling labeling = inst.getLabeling ();
        //logger.fine ("Instance "+ii+" labeling="+labeling);
        FeatureVector fv = (FeatureVector) inst.getData ();
        Alphabet fdict = fv.getAlphabet();
        assert (fv.getAlphabet() == fd);
        int li = labeling.getBestIndex();
        // The "2*" below is because there is one copy for the p(y|x)and another for the p(x|y).
        MatrixOps.rowPlusEquals (constraints, numFeatures, li, fv, 2*instanceWeight);
        // For the default feature, whose weight is 1.0
        assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
        assert(!Double.isNaN(li)) : "bestIndex is NaN";
        boolean hasNaN = false;
        for(int i = 0; i < fv.numLocations(); i++) {
          if(Double.isNaN(fv.valueAtLocation(i))) {
            logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
            hasNaN = true;
          }
        }
        if(hasNaN)
          logger.info("NaN in instance: " + inst.getName());
        // Only p(y|x) uses the default feature; p(x|y) doesn't use it.  The default feature value is 1.0.
        constraints[li*numFeatures + defaultFeatureIndex] += instanceWeight;
      }
      //TestMaximizable.testValueAndGradientCurrentParameters (this);
    }

    public MCMaxEnt getClassifier () { return theClassifier; }

    public double getParameter (int index) {
      return parameters[index];
    }

    public void setParameter (int index, double v) {
      cachedValueStale = true;
      cachedGradientStale = true;
      parameters[index] = v;
    }

    public int getNumParameters() {
      return parameters.length;
    }

    public void getParameters (double[] buff) {
      if (buff == null || buff.length != parameters.length)
        buff = new double [parameters.length];
      System.arraycopy (parameters, 0, buff, 0, parameters.length);
    }

    public void setParameters (double [] buff) {
      assert (buff != null);
      cachedValueStale = true;
      cachedGradientStale = true;
      if (buff.length != parameters.length)
        parameters = new double[buff.length];
      System.arraycopy (buff, 0, parameters, 0, buff.length);
    }


    // log probability of the training labels
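    // Roughly, the value returned (after the final sign flip for maximization) is
    //   sum_i w_i * log p(y_i | x_i)
    //   + sum_i w_i * sum_f fv_i(f) * log p(f | y_i)        (multi-conditional term, if enabled)
    //   - sum_{y,f} theta_{y,f}^2 / (2 * gaussianPriorVariance)   (or the hyperbolic penalty)
    // where w_i is the instance weight and theta are the parameters.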
    public double getValue ()
    {
      if (cachedValueStale) {
        numGetValueCalls++;
        cachedValue = 0;
        // We'll store the expectation values in "cachedGradient" for now
        cachedGradientStale = true;
        java.util.Arrays.fill (cachedGradient, 0.0);
        // Incorporate likelihood of data
        double[] scores = new double[trainingList.getTargetAlphabet().size()];
        double value = 0.0;
        //System.out.println("I Now "+inputAlphabet.size()+" regular features.");
        Iterator<Instance> iter = trainingList.iterator();
        //int ii = 0;

        // Normalize the parameters to be per-class multinomials
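        // (each row is a softmax of that class's parameters over features;
        // subtracting the global maximum parameter before exponentiating only
        // guards against overflow and cancels in the normalization)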
        double[][] probs = new double[scores.length][numFeatures];
        double[][] lprobs = new double[scores.length][numFeatures];

        for (int si = 0; si < scores.length; si++) {
          double sum = 0, max = MatrixOps.max (parameters);
          for (int fi = 0; fi < numFeatures; fi++) {
            // TODO Strongly consider some smoothing here.  What happens when all parameters are zero?
            // Oh, this should be no problem, because exp(0) == 1.
            probs[si][fi] = Math.exp(parameters[si*numFeatures+fi] - max);
            sum += probs[si][fi];
          }
          assert (sum > 0);
          for (int fi = 0; fi < numFeatures; fi++) {
            probs[si][fi] /= sum;
            lprobs[si][fi] = Math.log(probs[si][fi]);
          }
        }

        while (iter.hasNext()) {
          Instance instance = iter.next();
          double instanceWeight = trainingList.getInstanceWeight(instance);
          Labeling labeling = instance.getLabeling ();
          //System.out.println("L Now "+inputAlphabet.size()+" regular features.");

          this.theClassifier.getClassificationScores (instance, scores);
          FeatureVector fv = (FeatureVector) instance.getData ();
          int li = labeling.getBestIndex();
          value = - (instanceWeight * Math.log (scores[li]));
          if (Double.isNaN(value)) {
            logger.fine ("MCMaxEntTrainer: Instance " + instance.getName() +
                         " has NaN value. log(scores[li]) = " + Math.log(scores[li]) +
                         ", scores[li] = " + scores[li] +
                         ", instance weight = " + instanceWeight);
          }
          if (Double.isInfinite(value)) {
            logger.warning ("Instance "+instance.getSource() + " has infinite value; skipping value and gradient");
            cachedValue -= value;
            cachedValueStale = false;
            return -value;
//            continue;
          }
          cachedValue += value;
          // CPAL - this is a loop over classes and their scores
          //      - we compute the gradient by taking the dot product of the feature value
          //        and the probability of the class
          for (int si = 0; si < scores.length; si++) {
            if (scores[si] == 0) continue;
            assert (!Double.isInfinite(scores[si]));
            // CPAL - accumulating the current classifiers expectation of the feature
            // vector counts for this class label
            // Current classifier has expectation over class label, not over feature vector
            MatrixOps.rowPlusEquals (cachedGradient, numFeatures,
                                     si, fv, -instanceWeight * scores[si]);
            cachedGradient[numFeatures*si + defaultFeatureIndex] += (-instanceWeight * scores[si]);
          }

          // CPAL - if we wish to do multiconditional training we need another term for this accumulated
          //        expectation
          if (usingMultiConditionalTraining) {
            // need something analogous to this
            // this.theClassifier.getClassificationScores (instance, scores);
            // this.theClassifier.getFeatureDistributions (instance,
            // Note: li is the "label" for this instance

            // Get the sum of the feature vector
            // which is the number of counts for the document if we use that as input
            double Ncounts = MatrixOps.sum(fv);

            // CPAL - get the additional term for the value of our - log probability
            //      - this computation amounts to the dot product of the feature vector and the probability vector
            cachedValue -= (instanceWeight * fv.dotProduct(lprobs[li]));

            // CPAL - get the model expectation over features for the given class
            for (int fi = 0; fi < numFeatures; fi++) {

              //if(parameters[numFeatures*li + fi] != 0) {
              // MatrixOps.rowPlusEquals(cachedGradient, numFeatures,li,fv,))
              cachedGradient[numFeatures*li + fi] += (-instanceWeight * Ncounts * probs[li][fi]);
              //    }
            }

          }
        }
        //logger.info ("-Expectations:"); cachedGradient.print();
        // Incorporate prior on parameters
        if (usingHyperbolicPrior) {
          for (int li = 0; li < numLabels; li++)
            for (int fi = 0; fi < numFeatures; fi++)
              cachedValue += (hyperbolicPriorSlope / hyperbolicPriorSharpness
                              * Math.log (Maths.cosh (hyperbolicPriorSharpness * parameters[li *numFeatures + fi])));
        } else {
          for (int li = 0; li < numLabels; li++)
            for (int fi = 0; fi < numFeatures; fi++) {
              double param = parameters[li*numFeatures + fi];
              cachedValue += param * param / (2 * gaussianPriorVariance);
            }
        }
        cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
        cachedValueStale = false;
        progressLogger.info ("Value (loglikelihood) = "+cachedValue);
      }
      return cachedValue;
    }

    // CPAL first get value, then gradient

    public void getValueGradient (double [] buffer)
    {
      // Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
      if (cachedGradientStale) {
        numGetValueGradientCalls++;
        if (cachedValueStale)
          // This will fill in the cachedGradient with the "-expectation"
          getValue ();
        // cachedGradient contains the negative expectations
        // expectations are model expectations and constraints are
        // empirical expectations
        MatrixOps.plusEquals (cachedGradient, constraints);
        // CPAL - we need a second copy of the constraints
        //      - actually, we only want this for the feature values
        //      - I've moved this up into getValue
        //if (usingMultiConditionalTraining){
        //    MatrixOps.plusEquals(cachedGradient, constraints);
        //}
        // Incorporate prior on parameters
        if (usingHyperbolicPrior) {
          throw new UnsupportedOperationException ("Hyperbolic prior not yet implemented.");
        }
        else {
          MatrixOps.plusEquals (cachedGradient, parameters,
                                -1.0 / gaussianPriorVariance);
        }

        // A parameter may be set to -infinity by an external user.
        // We set gradient to 0 because the parameter's value can
        // never change anyway and it will mess up future calculations
        // on the matrix, such as norm().
        MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
        // Set to zero all the gradient dimensions that are not among the selected features
        if (perLabelFeatureSelection == null) {
          for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
            MatrixOps.rowSetAll (cachedGradient, numFeatures,
                                 labelIndex, 0.0, featureSelection, false);
        } else {
          for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
            MatrixOps.rowSetAll (cachedGradient, numFeatures,
                                 labelIndex, 0.0,
                                 perLabelFeatureSelection[labelIndex], false);
        }
        cachedGradientStale = false;
      }
      assert (buffer != null && buffer.length == parameters.length);
      System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
    }

    public double sumNegLogProb (double a, double b)
    {
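      // Numerically stable form of -log(exp(-a) + exp(-b)) for negative log
      // probabilities a and b: the larger probability is factored out so the
      // remaining exp() argument is never positive.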
      if (a == Double.POSITIVE_INFINITY && b == Double.POSITIVE_INFINITY)
        return Double.POSITIVE_INFINITY;
      else if (a > b)
        return b - Math.log (1 + Math.exp(b-a));
      else
        return a - Math.log (1 + Math.exp(a-b));
    }

  }

}