/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a>
*/
/**
Limited Memory BFGS, as described in Byrd, Nocedal, and Schnabel,
"Representations of Quasi-Newton Matrices and Their Use in Limited
Memory Methods"
*/
package cc.mallet.optimize;
import java.util.logging.*;
import java.util.LinkedList;
import cc.mallet.optimize.BackTrackLineSearch;
import cc.mallet.optimize.LineOptimizer;
import cc.mallet.optimize.Optimizable;
import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;
public class LimitedMemoryBFGS implements Optimizer {
private static Logger logger = MalletLogger.getLogger("edu.umass.cs.mallet.base.ml.maximize.LimitedMemoryBFGS");
boolean converged = false;
Optimizable.ByGradientValue optimizable;
final int maxIterations = 1000;
// xxx need a more principled stopping point
//final double tolerance = .0001;
private double tolerance = .0001;
final double gradientTolerance = .001;
final double eps = 1.0e-5;
// The number of corrections used in BFGS update
// ideally 3 <= m <= 7. Larger m means more cpu time, memory.
final int m = 4;
// Line search function
private LineOptimizer.ByGradient lineMaximizer;
public LimitedMemoryBFGS (Optimizable.ByGradientValue function) {
this.optimizable = function;
lineMaximizer = new BackTrackLineSearch (function);
}
public Optimizable getOptimizable () { return this.optimizable; }
public boolean isConverged () { return converged; }
/**
* Sets the LineOptimizer.ByGradient to use in L-BFGS optimization.
* @param lineOpt line optimizer for L-BFGS
*/
public void setLineOptimizer(LineOptimizer.ByGradient lineOpt) {
lineMaximizer = lineOpt;
}
// State of search
// g = gradient
// s = list of m previous "parameters" values
// y = list of m previous "g" values
// rho = intermediate calculation
double [] g, oldg, direction, parameters, oldParameters;
LinkedList s = new LinkedList();
LinkedList y = new LinkedList();
LinkedList rho = new LinkedList();
double [] alpha;
static double step = 1.0;
int iterations;
private OptimizerEvaluator.ByGradient eval = null;
// CPAL - added this
public void setTolerance(double newtol) {
this.tolerance = newtol;
}
public void setEvaluator (OptimizerEvaluator.ByGradient eval) {
this.eval = eval;
}
public int getIteration () {
return iterations;
}
public boolean optimize () {
return optimize (Integer.MAX_VALUE);
}
public boolean optimize (int numIterations) {
double initialValue = optimizable.getValue();
logger.fine("Entering L-BFGS.optimize(). Initial Value="+initialValue);
if (g == null) { //first time through
logger.fine("First time through L-BFGS");
iterations = 0;
s = new LinkedList();
y = new LinkedList();
rho = new LinkedList();
alpha = new double[m];
for (int i=0; i<m; i++) {
alpha[i] = 0.0;
}
parameters = new double[optimizable.getNumParameters()];
oldParameters = new double[optimizable.getNumParameters()];
g = new double[optimizable.getNumParameters()];
oldg = new double[optimizable.getNumParameters()];
direction = new double[optimizable.getNumParameters()];
optimizable.getParameters (parameters);
System.arraycopy (parameters, 0, oldParameters, 0, parameters.length);
optimizable.getValueGradient (g);
System.arraycopy (g, 0, oldg, 0, g.length);
System.arraycopy (g, 0, direction, 0, g.length);
if (MatrixOps.absNormalize (direction) == 0) {
logger.info("L-BFGS initial gradient is zero; saying converged");
g = null;
converged = true;
return true;
}
logger.fine ("direction.2norm: " + MatrixOps.twoNorm (direction));
MatrixOps.timesEquals(direction, 1.0 / MatrixOps.twoNorm(direction));
// make initial jump
logger.fine ("before initial jump: \ndirection.2norm: " +
MatrixOps.twoNorm (direction) + " \ngradient.2norm: " +
MatrixOps.twoNorm (g) + "\nparameters.2norm: " +
MatrixOps.twoNorm(parameters));
//TestMaximizable.testValueAndGradientInDirection (maxable, direction);
step = lineMaximizer.optimize(direction, step);
if (step == 0.0) {
// could not step in this direction.
// give up and say converged.
g = null; // reset search
step = 1.0;
throw new OptimizationException("Line search could not step in the current direction. " +
"(This is not necessarily cause for alarm. Sometimes this happens close to the maximum," +
" where the function may be very flat.)");
//return false;
}
optimizable.getParameters (parameters);
optimizable.getValueGradient(g);
logger.fine ("after initial jump: \ndirection.2norm: " +
MatrixOps.twoNorm (direction) + " \ngradient.2norm: "
+ MatrixOps.twoNorm (g));
}
for (int iterationCount = 0; iterationCount < numIterations; iterationCount++) {
double value = optimizable.getValue();
logger.fine("L-BFGS iteration="+iterationCount
+", value="+value+" g.twoNorm: "+MatrixOps.twoNorm(g)+
" oldg.twoNorm: "+MatrixOps.twoNorm(oldg));
// get difference between previous 2 gradients and parameters
double sy = 0.0;
double yy = 0.0;
for (int i=0; i < oldParameters.length; i++) {
// -inf - (-inf) = 0; inf - inf = 0
if (Double.isInfinite(parameters[i]) &&
Double.isInfinite(oldParameters[i]) &&
(parameters[i] * oldParameters[i] > 0)) {
oldParameters[i] = 0.0;
}
else {
oldParameters[i] = parameters[i] - oldParameters[i];
}
if (Double.isInfinite(g[i]) &&
Double.isInfinite(oldg[i]) &&
(g[i] * oldg[i] > 0)) {
oldg[i] = 0.0;
}
else {
oldg[i] = g[i] - oldg[i];
}
sy += oldParameters[i] * oldg[i]; // si * yi
yy += oldg[i] * oldg[i];
direction[i] = g[i];
}
if ( sy > 0 ) {
throw new InvalidOptimizableException ("sy = "+sy+" > 0" );
}
double gamma = sy / yy; // scaling factor
if ( gamma > 0 ) {
throw new InvalidOptimizableException ("gamma = "+gamma+" > 0" );
}
push (rho, 1.0/sy);
// These arrays are now the *differences* between parameters and gradient.
push (s, oldParameters);
push (y, oldg);
assert (s.size() == y.size()) : "s.size: " + s.size() + " y.size: " + y.size();
//
// This next section is where we calculate the new direction
//
// First work backwards, from the most recent difference vectors
for (int i = s.size() - 1; i >= 0; i--) {
alpha[i] = ((Double)rho.get(i)).doubleValue() * MatrixOps.dotProduct ( (double[])s.get(i), direction );
MatrixOps.plusEquals (direction, (double[])y.get(i), -1.0 * alpha[i]);
}
// Scale the direction by the ratio of s'y and y'y
MatrixOps.timesEquals(direction, gamma);
// Now work forwards, from the oldest to the newest difference vectors
for (int i = 0; i < y.size(); i++) {
double beta =
(((Double)rho.get(i)).doubleValue()) *
MatrixOps.dotProduct((double[])y.get(i), direction);
MatrixOps.plusEquals(direction, (double[])s.get(i), alpha[i] - beta);
}
// Move the current values to the "last iteration" buffers and negate the search direction
for (int i=0; i < oldg.length; i++) {
oldParameters[i] = parameters[i];
oldg[i] = g[i];
direction[i] *= -1.0;
}
logger.fine ("before linesearch: direction.gradient.dotprod: "+
MatrixOps.dotProduct(direction,g)+"\ndirection.2norm: " +
MatrixOps.twoNorm (direction) + "\nparameters.2norm: " +
MatrixOps.twoNorm(parameters));
// Test whether the gradient is ok
//TestMaximizable.testValueAndGradientInDirection (maxable, direction);
// Do a line search in the current direction
step = lineMaximizer.optimize(direction, step);
if (step == 0.0) { // could not step in this direction.
g = null; // reset search
step = 1.0;
// xxx Temporary test; passed OK
// TestMaximizable.testValueAndGradientInDirection (maxable, direction);
throw new OptimizationException("Line search could not step in the current direction. " +
"(This is not necessarily cause for alarm. Sometimes this happens close to the maximum," +
" where the function may be very flat.)");
// return false;
}
optimizable.getParameters (parameters);
optimizable.getValueGradient(g);
logger.fine ("after linesearch: direction.2norm: " +
MatrixOps.twoNorm (direction));
double newValue = optimizable.getValue();
// Test for terminations
if (2.0 * Math.abs(newValue-value) <= tolerance * (Math.abs(newValue) + Math.abs(value) + eps)) {
logger.info("Exiting L-BFGS on termination #1:\nvalue difference below tolerance (oldValue: " + value + " newValue: " + newValue);
converged = true;
return true;
}
double gg = MatrixOps.twoNorm(g);
if (gg < gradientTolerance) {
logger.fine("Exiting L-BFGS on termination #2: \ngradient="+gg+" < "+gradientTolerance);
converged = true;
return true;
}
if (gg == 0.0) {
logger.fine("Exiting L-BFGS on termination #3: \ngradient==0.0");
converged = true;
return true;
}
logger.fine("Gradient = "+gg);
iterations++;
if (iterations > maxIterations) {
System.err.println("Too many iterations in L-BFGS.java. Continuing with current parameters.");
converged = true;
return true;
//throw new IllegalStateException ("Too many iterations.");
}
//end of iteration. call evaluator
if (eval != null && ! eval.evaluate (optimizable, iterationCount)) {
logger.fine ("Exiting L-BFGS on termination #4: evaluator returned false.");
converged = true;
return false;
}
}
return false;
}
/** Resets the previous gradients and values that are used to
* approximate the Hessian. NOTE - If the {@link Optimizable} object
* is modified externally, this method should be called to avoid
* IllegalStateExceptions. */
public void reset () {
g = null;
}
/**
* Pushes a new object onto the queue l
* @param l linked list queue of Matrix obj's
* @param toadd matrix to push onto queue
*/
private void push(LinkedList l, double[] toadd) {
assert(l.size() <= m);
if (l.size() == m) {
// remove oldest matrix and add newest to end of list.
// to make this more efficient, actually overwrite
// memory of oldest matrix
// this overwrites the oldest matrix
double[] last = (double[]) l.get(0);
System.arraycopy(toadd, 0, last, 0, toadd.length);
Object ptr = last;
// this readjusts the pointers in the list
for (int i=0; i<l.size()-1; i++) {
l.set(i, (double[])l.get(i+1));
}
l.set(m-1, ptr);
}
else {
double [] newArray = new double[toadd.length];
System.arraycopy (toadd, 0, newArray, 0, toadd.length);
l.addLast(newArray);
}
}
/**
* Pushes a new object onto the queue l
* @param l linked list queue of Double obj's
* @param toadd double value to push onto queue
*/
private void push(LinkedList l, double toadd) {
assert(l.size() <= m);
if (l.size() == m) { //pop old double and add new
l.removeFirst();
l.addLast(new Double(toadd));
}
else {
l.addLast(new Double(toadd));
}
}
}