Source Code of cc.mallet.topics.DMROptimizable

package cc.mallet.topics;

/** This class implements the value and gradient functions for
*   Dirichlet-multinomial regression (DMR). See Guimaraes and Lindrooth
*   for a general introduction to DMR,
*   and Mimno and McCallum (UAI, 2008) for an application to
*   multinomial mixture models.
*/
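
// Model sketch (notation inferred from the code below): each document d has
//  an observed feature vector x_d, and each label/topic k gets a Dirichlet
//  parameter alpha_{dk} = exp(lambda_k . x_d + b_k), where b_k is the weight
//  of the "default feature" (intercept). getValue() returns the log
//  likelihood of the observed count vectors under these parameters plus a
//  Gaussian log prior on the weights; getValueGradient() returns its
//  gradient with respect to lambda and b.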

import cc.mallet.optimize.Optimizable;
import cc.mallet.classify.MaxEnt;

import cc.mallet.types.InstanceList;
import cc.mallet.types.Instance;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Dirichlet;
import cc.mallet.types.MatrixOps;

import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;

import java.util.logging.*;

import java.text.NumberFormat;
import java.text.DecimalFormat;

public class DMROptimizable implements Optimizable.ByGradientValue {

  private static Logger logger = MalletLogger.getLogger(DMROptimizable.class.getName());
  private static Logger progressLogger = MalletProgressMessageLogger.getLogger(DMROptimizable.class.getName()+"-pl");

  MaxEnt classifier;
  InstanceList trainingList;

  int numGetValueCalls = 0;
  int numGetValueGradientCalls = 0;
  int numIterations = Integer.MAX_VALUE;

  NumberFormat formatter = null;

  static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1.0;
  static final double DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE = 100.0;
  static final double DEFAULT_GAUSSIAN_PRIOR_MEAN = 0.0;

  double gaussianPriorMean = DEFAULT_GAUSSIAN_PRIOR_MEAN;
  double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;

  // Allowing the default feature (the base level) to
  //  fluctuate more freely than the feature parameters leads
  //  to much better results.
  double defaultFeatureGaussianPriorVariance = DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE;

  double[] parameters;
  double[] cachedGradient;

  double cachedValue;
  boolean cachedValueStale;
  boolean cachedGradientStale;
  int numLabels;
  int numFeatures;
  int defaultFeatureIndex;

  public DMROptimizable () {}

  public DMROptimizable (InstanceList instances, MaxEnt initialClassifier) {

    this.trainingList = instances;
    Alphabet alphabet = instances.getDataAlphabet();
    Alphabet labelAlphabet = instances.getTargetAlphabet();

    this.numLabels = labelAlphabet.size();

    // Add one feature for the "default feature".
    this.numFeatures = alphabet.size() + 1; // add a spot for the intercept term
           
    //System.out.println("num features: " + numFeatures + " numLabels: " + numLabels);

    this.defaultFeatureIndex = numFeatures - 1;

    this.parameters = new double [numLabels * numFeatures];

    //this.constraints = new double [numLabels * numFeatures];
    this.cachedGradient = new double [numLabels * numFeatures];

    if (initialClassifier != null) {
      this.classifier = initialClassifier;
      this.parameters = classifier.getParameters();
      this.defaultFeatureIndex = classifier.getDefaultFeatureIndex();
      assert (initialClassifier.getInstancePipe() == instances.getPipe());
    }
    else if (this.classifier == null) {
      this.classifier =
        new MaxEnt (instances.getPipe(), parameters);
    }

    formatter = new DecimalFormat("0.###E0");

    cachedValueStale = true;
    cachedGradientStale = true;

    // Check the training data for NaN feature values

    logger.fine("Number of instances in training list = " + trainingList.size());

    for (Instance instance : trainingList) {
      FeatureVector multinomialValues = (FeatureVector) instance.getTarget();

      if (multinomialValues == null)
        continue;

      FeatureVector features = (FeatureVector) instance.getData();
      assert (features.getAlphabet() == alphabet);

      boolean hasNaN = false;

      for (int i = 0; i < features.numLocations(); i++) {
        if (Double.isNaN(features.valueAtLocation(i))) {
          logger.info("NaN for feature " + alphabet.lookupObject(features.indexAtLocation(i)).toString());
          hasNaN = true;
        }
      }

      if (hasNaN) {
        logger.info("NaN in instance: " + instance.getName());
      }

    }

    //TestMaximizable.testValueAndGradientCurrentParameters (this);
  }
 
  /** Set the variance for the default features (aka intercept terms), generally
   *   larger than the variance for the regular features.
   */
  public void setInterceptGaussianPriorVariance(double sigmaSquared) {
    this.defaultFeatureGaussianPriorVariance = sigmaSquared;
  }

  /** Set the variance for regular (non default) features, generally
   *   smaller than the variance for the default features.
   */
  public void setRegularGaussianPriorVariance(double sigmaSquared) {
    this.gaussianPriorVariance = sigmaSquared;
  }

  public MaxEnt getClassifier () { return classifier; }
               
  public double getParameter (int index) {
    return parameters[index];
  }
               
  public void setParameter (int index, double v) {
    cachedValueStale = true;
    cachedGradientStale = true;
    parameters[index] = v;
  }
               
  public int getNumParameters() {
    return parameters.length;
  }
               
  public void getParameters (double[] buff) {
    // The caller must supply a buffer of the correct length; a locally
    //  reallocated buffer would never be visible to the caller.
    assert (buff != null && buff.length == parameters.length);
    System.arraycopy (parameters, 0, buff, 0, parameters.length);
  }
       
  public void setParameters (double [] buff) {
    assert (buff != null);
    cachedValueStale = true;
    cachedGradientStale = true;
    if (buff.length != parameters.length)
      parameters = new double[buff.length];
    System.arraycopy (buff, 0, parameters, 0, buff.length);
  }

  /** The log probability of the observed count vectors given the features. */
  public double getValue () {

    if (! cachedValueStale) { return cachedValue; }

    numGetValueCalls++;
    cachedValue = 0;

    // Incorporate likelihood of data
    double[] scores = new double[ trainingList.getTargetAlphabet().size() ];
    double value = 0.0;

    int instanceIndex = 0;
   
    for (Instance instance: trainingList) {

      FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
      if (multinomialValues == null) { continue; }

      //System.out.println("L Now "+inputAlphabet.size()+" regular features.");
               
      // Get the predicted probability of each class
      //   under the current model parameters
      this.classifier.getUnnormalizedClassificationScores(instance, scores);

      double sumScores = 0.0;

      // Exponentiate the scores
      for (int i=0; i<scores.length; i++) {
        // Due to underflow, it's very likely that some of these scores will be 0.0.
        scores[i] = Math.exp(scores[i]);
        sumScores += scores[i];
      }
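
      // After exponentiation, scores[k] = exp(lambda_k . x_d): the code
      //  below uses it as the Dirichlet parameter alpha_k for label k,
      //  i.e. a per-document prior pseudo-count.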

      FeatureVector features = (FeatureVector) instance.getData();

      // This is really an int, but since FeatureVector values are doubles,
      //  avoid casting.
      double totalLength = 0;

      for (int i = 0; i < multinomialValues.numLocations(); i++) {
        int label = multinomialValues.indexAtLocation(i);
        double count = multinomialValues.valueAtLocation(i);
        value += (Dirichlet.logGammaStirling(scores[label] + count) -
              Dirichlet.logGammaStirling(scores[label]));
        totalLength += count;
      }

      value -= (Dirichlet.logGammaStirling(sumScores + totalLength) -
            Dirichlet.logGammaStirling(sumScores));
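
      // Together, the two blocks above accumulate the Dirichlet-multinomial
      //  (Polya) log likelihood of this document's count vector n given
      //  alpha = scores:
      //    log P(n | alpha) = log Gamma(A) - log Gamma(A + N)
      //      + sum_k [ log Gamma(alpha_k + n_k) - log Gamma(alpha_k) ]
      //  where A = sum_k alpha_k (sumScores) and N = sum_k n_k (totalLength).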
                   
      // Error Checking:
               
      if (Double.isNaN(value)) {
        logger.fine ("DCMMaxEntTrainer: Instance " + instance.getName() +
               " has a NaN value.");

        for (int label: multinomialValues.getIndices()) {
          logger.fine ("log(scores)= " + Math.log(scores[label]) +
                 " scores = " + scores[label]);
        }
      }

      if (Double.isInfinite(value)) {
        logger.warning ("Instance " + instance.getSource() +
                " has infinite value; skipping value and gradient");
        cachedValue -= value;
        cachedValueStale = false;
        return -value;
      }

      //System.out.println(value);

      cachedValue += value;
               
      instanceIndex++;
    }

    // Incorporate prior on parameters

    double prior = 0;

    // The log of a Gaussian prior is -(x - mu)^2 / (2 sigma^2), up to a constant.

    for (int label = 0; label < numLabels; label++) {
      for (int feature = 0; feature < numFeatures - 1; feature++) {
        double param = parameters[label*numFeatures + feature];
        prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) / (2 * gaussianPriorVariance);
      }
      double param = parameters[label*numFeatures + defaultFeatureIndex];
      prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) /
        (2 * defaultFeatureGaussianPriorVariance);
    }

    double labelProbability = cachedValue;
    cachedValue += prior;
    cachedValueStale = false;
    progressLogger.info ("Value (likelihood=" + formatter.format(labelProbability) +
               " prior=" + formatter.format(prior) +
               ") = " + formatter.format(cachedValue));

    return cachedValue;
  }

  public void getValueGradient (double [] buffer) {

    MatrixOps.setAll (cachedGradient, 0.0);

    // Incorporate likelihood of data
    double[] scores = new double[ trainingList.getTargetAlphabet().size() ];

    int instanceIndex = 0;

    for (Instance instance: trainingList) {

      FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
      if (multinomialValues == null) { continue; }

      // Get the predicted probability of each class
      //   under the current model parameters
      this.classifier.getUnnormalizedClassificationScores(instance, scores);

      double sumScores = 0.0;

      // Exponentiate the scores
      for (int i=0; i<scores.length; i++) {
        // Due to underflow, it's very likely that some of these scores will be 0.0.
        scores[i] = Math.exp(scores[i]);
        sumScores += scores[i];
      }

      FeatureVector features = (FeatureVector) instance.getData();

      double totalLength = 0;

      for (double count : multinomialValues.getValues()) {
        totalLength += count;
      }
     
      double digammaDifferenceForSums =
        Dirichlet.digamma(sumScores + totalLength) -
        Dirichlet.digamma(sumScores);
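
      // Chain rule note: with alpha_k = exp(lambda_k . x_d), we have
      //  d(alpha_k)/d(lambda_{k,f}) = x_{d,f} * alpha_k, which is why each
      //  gradient contribution below is scaled by value * scores[label]
      //  (or by scores[label] alone for the default feature, whose value is 1).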
     
      for (int loc = 0; loc < features.numLocations(); loc++) {
        int index = features.indexAtLocation(loc);
        double value = features.valueAtLocation(loc);
                   
        if (value == 0.0) { continue; }

        // In a FeatureVector, there's no easy way to say "do you know
        //   about this id?" so I've broken this into two for loops,
        //  one for all labels, the other for just the non-zero ones.

        for (int label=0; label<numLabels; label++) {
          cachedGradient[label * numFeatures + index] -=
            value * scores[label] * digammaDifferenceForSums;
        }

        for (int labelLoc = 0; labelLoc < multinomialValues.numLocations(); labelLoc++) {
          int label = multinomialValues.indexAtLocation(labelLoc);
          double count = multinomialValues.valueAtLocation(labelLoc);

          double diff = 0.0;
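
          // For an integer count n, digamma(x + n) - digamma(x) equals the
          //  finite sum 1/x + 1/(x+1) + ... + 1/(x+n-1), so for small counts
          //  the explicit sum is cheaper and numerically cleaner than two
          //  digamma evaluations.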
                           
          if (count < 20) {
            for (int i=0; i < count; i++) {
              diff += 1 / (scores[label] + i);
            }
          }
          else {
            diff = Dirichlet.digamma(scores[label] + count) -
              Dirichlet.digamma(scores[label]);
          }

          cachedGradient[label * numFeatures + index] +=
            value * scores[label] * diff;

        }
      }
      // Now add the default feature

      for (int label=0; label<numLabels; label++) {
        cachedGradient[label * numFeatures + defaultFeatureIndex] -=
          scores[label] * digammaDifferenceForSums;
      }


      for (int labelLoc = 0; labelLoc < multinomialValues.numLocations(); labelLoc++) {
        int label = multinomialValues.indexAtLocation(labelLoc);
        double count = multinomialValues.valueAtLocation(labelLoc);

        double diff = 0.0;

        if (count < 20) {
          for (int i=0; i < count; i++) {
            diff += 1 / (scores[label] + i);
          }
        }
        else {
          diff = Dirichlet.digamma(scores[label] + count) -
            Dirichlet.digamma(scores[label]);
        }

        cachedGradient[label * numFeatures + defaultFeatureIndex] +=
          scores[label] * diff;
      }

    }

    numGetValueGradientCalls++;
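
    // Gradient of the log Gaussian prior: the derivative of
    //  -(param - mu)^2 / (2 sigma^2) with respect to param is
    //  -(param - mu) / sigma^2.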
           
    for (int label = 0; label < numLabels; label++) {
      for (int feature = 0; feature < numFeatures - 1; feature++) {
        double param = parameters[label*numFeatures + feature];

        cachedGradient[label * numFeatures + feature] -=
          (param - gaussianPriorMean) / gaussianPriorVariance;
      }

      double param = parameters[label*numFeatures + defaultFeatureIndex];
               
      cachedGradient[label * numFeatures + defaultFeatureIndex] -=
        (param - gaussianPriorMean) / defaultFeatureGaussianPriorVariance;
    }

    // A parameter may be set to -infinity by an external user.
    // We set gradient to 0 because the parameter's value can
    // never change anyway and it will mess up future calculations
    // on the matrix, such as norm().
    MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);

    assert (buffer != null && buffer.length == parameters.length);
    System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
    //System.out.println ("DCMMaxEntTrainer gradient infinity norm = "+MatrixOps.infinityNorm(cachedGradient));
  }
}
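
Example usage

A minimal sketch of how this class might be driven with MALLET's L-BFGS optimizer, following the pattern used elsewhere in MALLET (e.g. DMRTopicModel). The instance list `parameterInstances` and the two variance settings are illustrative assumptions, not values taken from this file.

import cc.mallet.optimize.LimitedMemoryBFGS;

// parameterInstances (hypothetical): an InstanceList whose data are
//  per-document feature vectors and whose targets are FeatureVectors
//  of observed per-topic counts.
DMROptimizable optimizable = new DMROptimizable (parameterInstances, null);
optimizable.setRegularGaussianPriorVariance (0.5);      // illustrative
optimizable.setInterceptGaussianPriorVariance (100.0);  // illustrative

LimitedMemoryBFGS optimizer = new LimitedMemoryBFGS (optimizable);

try {
  optimizer.optimize ();
} catch (IllegalArgumentException e) {
  // MALLET's line search may throw when it cannot make further progress;
  //  the parameters found so far are still usable.
}

MaxEnt dmrParameters = optimizable.getClassifier ();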