package de.jungblut.math.minimize;
import gnu.trove.list.array.TDoubleArrayList;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.util.FastMath;
import de.jungblut.math.DoubleVector;
import de.jungblut.math.dense.DenseDoubleVector;
/**
 * Java translation of C++ code of
 * "Orthant-Wise Limited-memory Quasi-Newton Optimizer for L1-regularized Objectives"
 * (@see <a href=
 * "
 * c9d498a03/"></a>). <br/>
 * <br/>
 * The Orthant-Wise Limited-memory Quasi-Newton algorithm (OWL-QN) is a
 * numerical optimization procedure for finding the optimum of an objective of
 * the form {smooth function} plus {L1-norm of the parameters}. It has been used
 * for training log-linear models (such as logistic regression) with
 * L1-regularization. The algorithm is described in
 * "Scalable training of L1-regularized log-linear models" by Galen Andrew and
 * Jianfeng Gao. <br/>
 * <br/>
 * The Orthant-Wise Limited-memory Quasi-Newton algorithm minimizes functions of
 * the form<br/>
 * <br/>
 * f(w) = loss(w) + C |w|_1<br/>
 * <br/>
 * where loss is an arbitrary differentiable convex loss function, and |w|_1 is
 * the L1 norm of the weight (parameter) vector. It is based on the L-BFGS
 * Quasi-Newton algorithm, with modifications to deal with the fact that the L1
 * norm is not differentiable. The algorithm is very fast, and capable of
 * scaling efficiently to problems with millions of parameters.<br/>
 * <br/>
 * This is a straightforward translation that uses my math library.
 *
 * @author thomas.jungblut
 */
// OWL-QN minimizer. Every working field below is (re)assigned at the start of
// minimize(...); m, l1weight, tol and gradCheck are configuration set through
// the fluent setters.
public class OWLQN extends AbstractMinimizer {
// Logger of this class.
private static final Log LOG = LogFactory.getLog(OWLQN.class);
// x: current point; grad: gradient kept together with x; newX: trial point
// that evaluateL1 evaluates; newGrad: gradient returned for newX; dir: search
// direction that the helper methods modify in place.
private DoubleVector x, grad, newX, newGrad, dir;
// Assigned at the end of makeSteepestDescDir and compared component-wise
// against dir in fixDirectionSigns.
private DoubleVector steepestDescDir;
// Scratch coefficients of mapDirectionByInverseHessian, allocated with length m.
private double[] alphas;
// Per-history-entry divisors read by mapDirectionByInverseHessian; entry i
// belongs to sList.get(i) / yList.get(i).
private TDoubleArrayList roList;
// Cost history inspected by the average-improvement stop check in minimize.
private TDoubleArrayList costs;
// History vectors: shift() computes an s entry as (newX - x) and a y entry as
// (newGrad - grad).
private ArrayList<DoubleVector> sList, yList;
// Latest objective value as returned by evaluateL1 (cost plus L1 term).
private double value;
// Number of history entries to keep (the "limited" part of L-BFGS).
private int m = 10;
// Weight of the L1 term; with the default of 0 the L1-specific branches are
// not taken.
private double l1weight = 0;
// Threshold for the relative average improvement computed in minimize.
private double tol = 1e-4;
// Enabled through doGradChecks(); read by updateDir.
private boolean gradCheck = false;
/**
 * Runs the optimizer for at most {@code maxIterations} iterations starting at
 * {@code theta}, clears the working fields and returns the vector held in
 * {@code newX}.
 *
 * @param f the cost function; it is evaluated through evaluateL1.
 * @param theta the start point; stored as x without copying and deep-copied
 *          into newX.
 * @param maxIterations upper bound of the iteration loop.
 * @param verbose guards the per-iteration "Iteration ... | Cost: ..." message.
 * @return the vector referenced by newX when the loop has ended.
 */
public DoubleVector minimize(CostFunction f, DoubleVector theta,
int maxIterations, boolean verbose) {
// grad, newGrad, dir and steepestDescDir all start out referencing this one
// zero vector; grad/newGrad are replaced by the first evaluateL1 call below.
DoubleVector zeros = new DenseDoubleVector(theta.getDimension());
this.x = theta;
this.grad = zeros;
this.newX = theta.deepCopy();
this.newGrad = zeros;
this.dir = zeros;
this.steepestDescDir = newGrad;
this.alphas = new double[m];
this.roList = new TDoubleArrayList(m);
this.costs = new TDoubleArrayList(m);
this.sList = new ArrayList<>();
this.yList = new ArrayList<>();
// initial evaluation at newX (a copy of theta); also assigns newGrad
this.value = evaluateL1(f);
this.grad = newGrad;
for (int i = 0; i < maxIterations; i++) {
updateDir(f, verbose);
boolean continueIterations = backTrackingLineSearch(i, f);
// NOTE(review): no call to shift() and no insertion into costs is visible in
// this loop; they presumably belong here. Verify against the complete file.
// also break on too small average improvement over 5 iterations
if (costs.size() > 5) {
double first = costs.get(0);
// NOTE(review): the body of this while loop is not visible; presumably it
// trims costs down to five entries. Confirm.
while (costs.size() > 5) {
// relative average improvement per stored cost entry
double avgImprovement = (first - value) / costs.size();
double perc = avgImprovement / Math.abs(value);
// NOTE(review): the statement guarded by this check is not visible.
if (perc < tol) {
// break if we can't get any improvement
if (!continueIterations) {
// NOTE(review): the call that consumes this message is missing here;
// presumably a LOG call. Confirm.
if (verbose) {"Iteration " + i + " | Cost: " + value);
// cleanup all the stuff in this class
x = null;
grad = null;
newGrad = null;
dir = null;
steepestDescDir = null;
alphas = null;
roList = null;
costs = null;
sList = null;
yList = null;
// NOTE(review): shift() swaps x and newX, so if it runs after every line
// search newX holds a copy of the previous point here while the latest point
// was in x (already nulled above). Confirm newX is the intended result.
return newX;
// Updates the search direction for the next line search; called once per
// iteration from minimize before backTrackingLineSearch.
// NOTE(review): only the gradCheck guard is visible in this view. The calls to
// makeSteepestDescDir, mapDirectionByInverseHessian, fixDirectionSigns and
// testDirectionDerivation (none of which is invoked anywhere else in the
// visible code) presumably belong in this method. Verify against the full file.
private void updateDir(CostFunction f, boolean verbose) {
if (gradCheck) {
// Gradient check: compares a finite-difference estimate of the directional
// derivative with the value returned by directionDerivation() and builds a
// "GradCheck" message from both numbers.
private void testDirectionDerivation(CostFunction f) {
// NOTE(review): the argument of sqrt is missing in this view; presumably the
// squared norm of dir. Confirm.
double dirNorm = FastMath.sqrt(;
// if dirNorm is 0, we probably hit the minimum, so there is no direction left
// to descend along.
if (dirNorm != 0d) {
// step length of the finite difference, scaled by the direction norm
double eps = 1.05e-8 / dirNorm;
// NOTE(review): evaluateL1 evaluates newX; a step that moves newX by eps
// along dir is not visible before this call.
double val2 = evaluateL1(f);
double numDeriv = (val2 - value) / eps;
// NOTE(review): the call that consumes the message below is missing here;
// presumably a LOG call. Confirm.
double deriv = directionDerivation();"GradCheck: expected= " + numDeriv + " vs. " + deriv
+ "! AbsDiff= " + Math.abs(numDeriv - deriv));
private void fixDirectionSigns() {
if (l1weight > 0) {
for (int i = 0; i < dir.getDimension(); i++) {
if (dir.get(i) * steepestDescDir.get(i) <= 0) {
dir.set(i, 0);
// Transforms dir in place using the stored history (sList, yList, roList):
// a backward pass over the history, a rescaling, then a forward pass.
// Nothing happens while the history is empty.
private void mapDirectionByInverseHessian() {
int count = sList.size();
if (count != 0) {
// backward pass, newest entry first: alphas[i] = -(s_i . dir) / ro_i,
// then dir += alphas[i] * y_i
for (int i = count - 1; i >= 0; i--) {
alphas[i] = -sList.get(i).dot(dir) / roList.get(i);
addMult(dir, yList.get(i), alphas[i]);
// rescale dir by ro / (y . y) of the newest history entry
DoubleVector lastY = yList.get(count - 1);
// NOTE(review): the right-hand side is missing in this view; presumably the
// dot product of lastY with itself. Confirm.
double yDotY =;
double scalar = roList.get(count - 1) / yDotY;
scale(dir, scalar);
// forward pass, oldest entry first: beta = (y_i . dir) / ro_i,
// then dir += (-alphas[i] - beta) * s_i
for (int i = 0; i < count; i++) {
double beta = yList.get(i).dot(dir) / roList.get(i);
addMult(dir, sList.get(i), -alphas[i] - beta);
private void makeSteepestDescDir() {
if (l1weight == 0) {
scaleInto(dir, grad, -1);
} else {
for (int i = 0; i < dir.getDimension(); i++) {
if (x.get(i) < 0) {
dir.set(i, -grad.get(i) + l1weight);
} else if (x.get(i) > 0) {
dir.set(i, -grad.get(i) - l1weight);
} else {
if (grad.get(i) < -l1weight) {
dir.set(i, -grad.get(i) - l1weight);
} else if (grad.get(i) > l1weight) {
dir.set(i, -grad.get(i) + l1weight);
} else {
dir.set(i, 0);
steepestDescDir = dir;
// Backtracking line search along dir.
// Returns false when the directional derivative is 0 or NaN (nothing left to
// descend along), true once a step has been accepted. Throws a
// RuntimeException when the directional derivative is positive.
// iter is the iteration index; iteration 0 uses a different initial step.
private boolean backTrackingLineSearch(int iter, CostFunction f) {
double origDirDeriv = directionDerivation();
// if a non-descent direction is chosen, the line search will break anyway,
// so throw here
// The most likely reason for this is a bug in your function's gradient
// computation
if (origDirDeriv > 0d) {
throw new RuntimeException(
"L-BFGS chose a non-descent direction: check your gradient!");
// NOTE(review): the call that consumes the message on the next line is
// missing here; presumably a LOG call. Confirm.
} else if (origDirDeriv == 0d || Double.isNaN(origDirDeriv)) {"L-BFGS apparently found the minimum. No direction to descent anymore.");
return false;
// default step length and shrink factor
double alpha = 1.0;
double backoff = 0.5;
// first iteration: start with a step of 1 / |dir| and shrink by 0.1 instead
if (iter == 0) {
// NOTE(review): the argument of sqrt is missing in this view; presumably the
// squared norm of dir. Confirm.
double normDir = FastMath.sqrt(;
alpha = (1 / normDir);
backoff = 0.1;
// constant of the sufficient-decrease test below
double c1 = 1e-4;
double oldValue = value;
while (true) {
// NOTE(review): evaluateL1 evaluates newX; the call that moves newX to
// x + alpha * dir (see getNextPoint) is not visible before this line.
value = evaluateL1(f);
// a step is accepted when the new value is NaN or satisfies
// value <= oldValue + c1 * origDirDeriv * alpha
// NOTE(review): the statement inside this if is not visible; presumably it
// leaves the loop. Confirm.
if (Double.isNaN(value) || value <= oldValue + c1 * origDirDeriv * alpha) {
// otherwise shrink the step and try again
alpha *= backoff;
return true;
private void getNextPoint(double alpha) {
addMultInto(newX, x, dir, alpha);
if (l1weight > 0) {
for (int i = 0; i < x.getDimension(); i++) {
if (x.get(i) * newX.get(i) < 0.0) {
newX.set(i, 0d);
private void addMultInto(DoubleVector a, DoubleVector b, DoubleVector c,
double d) {
for (int i = 0; i < a.getDimension(); i++) {
a.set(i, b.get(i) + c.get(i) * d);
private void addMult(DoubleVector a, DoubleVector b, double c) {
for (int i = 0; i < a.getDimension(); i++) {
a.set(i, a.get(i) + b.get(i) * c);
private void scale(DoubleVector a, double b) {
for (int i = 0; i < a.getDimension(); i++) {
a.set(i, a.get(i) * b);
void scaleInto(DoubleVector a, DoubleVector b, double c) {
for (int i = 0; i < a.getDimension(); i++) {
a.set(i, b.get(i) * c);
// Computes the derivative of the objective along dir at x.
// With an L1 weight, each non-zero dir[i] contributes
// dir[i] * (grad[i] -/+ l1weight); the sign of the weight follows the sign of
// x[i] or, when x[i] is 0, the sign of dir[i].
private double directionDerivation() {
// NOTE(review): the body of this branch is not visible; presumably it returns
// the dot product of dir and grad when no L1 weight is set. Confirm.
if (l1weight == 0.0) {
} else {
double val = 0.0;
for (int i = 0; i < dir.getDimension(); i++) {
// components with dir[i] == 0 contribute nothing
if (dir.get(i) != 0) {
if (x.get(i) < 0) {
val += dir.get(i) * (grad.get(i) - l1weight);
} else if (x.get(i) > 0) {
val += dir.get(i) * (grad.get(i) + l1weight);
// x[i] == 0 from here on: the sign of dir[i] selects the sign of the weight
} else if (dir.get(i) < 0) {
val += dir.get(i) * (grad.get(i) - l1weight);
} else if (dir.get(i) > 0) {
val += dir.get(i) * (grad.get(i) + l1weight);
return val;
private double evaluateL1(CostFunction f) {
CostGradientTuple evaluateCost = f.evaluateCost(newX);
newGrad = evaluateCost.getGradient();
double val = evaluateCost.getCost();
if (l1weight > 0) {
for (int i = 0; i < newGrad.getDimension(); i++) {
val += Math.abs(newX.get(i)) * l1weight;
return val;
// Computes the newest history entry (s = newX - x, y = newGrad - grad) and then
// exchanges the current and the trial state: x/grad receive copies of
// newX/newGrad, and newX/newGrad receive copies of the old x/grad.
private void shift() {
DoubleVector nextS = null;
DoubleVector nextY = null;
int listSize = sList.size();
// while fewer than m entries are stored, new vectors are allocated
if (listSize < m) {
nextS = new DenseDoubleVector(x.getDimension());
nextY = new DenseDoubleVector(x.getDimension());
// otherwise the vectors at index 0 are reused as storage
// NOTE(review): removing the entries at index 0 from sList, yList and roList
// is not visible here; presumably it happens in this branch. Confirm.
if (nextS == null) {
nextS = sList.get(0);
nextY = yList.get(0);
// nextS = newX - x, nextY = newGrad - grad
addMultInto(nextS, newX, x, -1);
addMultInto(nextY, newGrad, grad, -1);
// NOTE(review): the right-hand side is missing in this view; presumably the
// dot product of nextS and nextY. Appending nextS, nextY and ro to their
// lists is not visible either. Confirm against the full file.
double ro =;
// swap point and trial point, using deep copies on both sides
DoubleVector tmpNewX = newX.deepCopy();
newX = x.deepCopy();
x = tmpNewX;
// same swap for the gradients
DoubleVector tmpNewGrad = newGrad.deepCopy();
newGrad = grad.deepCopy();
grad = tmpNewGrad;
* Set to true this will check the gradients every iteration and print out if
* it aligns with the numerical gradient.
public OWLQN doGradChecks() {
this.gradCheck = true;
return this;
* The amount of directions and gradients to keep, this is the "limited" part
* of L-BFGS. It defaults to 10.
public OWLQN setM(int m) {
this.m = m;
return this;
* This implementation also supports l1 weight adjustment (without the
* costfunction knowing about it). This is turned off by default.
public OWLQN setL1Weight(double l1weight) {
this.l1weight = l1weight;
return this;
* The breaking tolerance over a window of five iterations. This defaults to
* 1e-4.
public OWLQN setTolerance(double tol) {
this.tol = tol;
return this;
* Minimizes the given cost function with L-BFGS.
* @param f the costfunction to minimize.
* @param theta the initial weights.
* @param maxIterations the maximum amount of iterations.
* @param verbose true if progress output shall be printed.
* @return the optimized set of parameters for the cost function.
public static DoubleVector minimizeFunction(CostFunction f,
DoubleVector theta, int maxIterations, boolean verbose) {
return new OWLQN().minimize(f, theta, maxIterations, verbose);