/*
* Encog(tm) Core v3.0 - Java Version
* http://www.heatonresearch.com/encog/
* http://code.google.com/p/encog-java/
* Copyright 2008-2011 Heaton Research, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information on Heaton Research copyrights, licenses
* and trademarks visit:
* http://www.heatonresearch.com/copyright
*/
package org.encog.neural.networks.training.lma;
import org.encog.mathutil.EncogMath;
import org.encog.mathutil.matrices.Matrix;
import org.encog.mathutil.matrices.decomposition.LUDecomposition;
import org.encog.ml.MLMethod;
import org.encog.ml.TrainingImplementationType;
import org.encog.ml.data.MLData;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.train.BasicTraining;
import org.encog.neural.networks.BasicNetwork;
import org.encog.neural.networks.structure.NetworkCODEC;
import org.encog.neural.networks.training.TrainingError;
import org.encog.neural.networks.training.propagation.TrainingContinuation;
import org.encog.util.validate.ValidateNetwork;
/**
 * Trains a neural network using the Levenberg-Marquardt algorithm (LMA). This
 * training technique is based on the mathematical method of the same name.
*
* http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
*
 * The LMA training technique has some important limitations that you should be
 * aware of before using it.
 *
 * Only neural networks that have a single output neuron can be used with this
 * training technique.
 *
 * The entire training set must be loaded into memory. Because of this, an
 * indexable training set must be used.
*
* However, despite these limitations, the LMA training technique can be a very
* effective training method.
*
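 * A minimal usage sketch, assuming an already-constructed BasicNetwork named
 * network (with a single output neuron) and an in-memory MLDataSet named
 * trainingSet; both names are illustrative placeholders:
 *
 * <pre>
 * LevenbergMarquardtTraining train = new LevenbergMarquardtTraining(network,
 * 		trainingSet);
 * do {
 * 	train.iteration();
 * 	System.out.println("Error: " + train.getError());
 * } while (train.getError() > 0.01);
 * </pre>
 *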
 * References:
 * - http://www-alg.ist.hokudai.ac.jp/~jan/alpha.pdf
 * - http://www.inference.phy.cam.ac.uk/mackay/Bayes_FAQ.html
* ----------------------------------------------------------------
*
 * This implementation of the Levenberg-Marquardt algorithm is based heavily on code
* published in an article by Cesar Roberto de Souza. The original article can be
* found here:
*
* http://crsouza.blogspot.com/2009/11/neural-network-learning-by-levenberg_18.html
*
* Portions of this class are under the following copyright/license.
* Copyright 2009 by Cesar Roberto de Souza, Released under the LGPL.
*
*/
public class LevenbergMarquardtTraining extends BasicTraining {
/**
* The amount to scale the lambda by.
*/
public static final double SCALE_LAMBDA = 10.0;
/**
* The max amount for the LAMBDA.
*/
public static final double LAMBDA_MAX = 1e25;
/**
* Number of points for finite difference.
*/
	public static final int NUM_POINTS = 3;
/**
* Return the sum of the diagonal.
*
* @param m
* The matrix to sum.
* @return The trace of the matrix.
*/
public static double trace(final double[][] m) {
double result = 0.0;
for (int i = 0; i < m.length; i++) {
result += m[i][i];
}
return result;
}
/**
* The network that is to be trained.
*/
private final BasicNetwork network;
/**
* The training set that we are using to train.
*/
private final MLDataSet indexableTraining;
/**
* The training set length.
*/
private final int trainingLength;
/**
* The number of "parameters" in the LMA algorithm. The parameters are what
* the LMA adjusts to achieve the desired outcome. For neural network
* optimization, the parameters are the weights and bias values.
*/
private final int parametersLength;
/**
* The neural network weights and bias values.
*/
private double[] weights;
	/**
	 * The "Hessian" matrix, used by the LMA.
	 */
	private final Matrix hessianMatrix;
	/**
	 * The Jacobian, a matrix of partial derivatives.
	 */
	private final double[][] jacobian;
	/**
	 * The "Hessian" matrix as a 2d array.
	 */
	private final double[][] hessian;
	/**
	 * The beta is multiplied by the sum of the squared errors.
	 */
private final double beta;
/**
* The lambda, or damping factor. This is increased until a desirable
* adjustment is found.
*/
private double lambda;
/**
* The calculated gradients.
*/
private final double[] gradient;
	/**
	 * The diagonal of the Hessian.
	 */
private final double[] diagonal;
/**
* The amount to change the weights by.
*/
private double[] deltas;
/**
* Gamma, used for Bayesian regularization.
*/
private double gamma;
/**
* The training elements.
*/
private final MLDataPair pair;
/**
* The derivative step size for finite difference.
*/
private final double[] derivativeStepSize;
/**
* The coefficients for finite difference.
*/
private final double[][][] differentialCoefficients;
	/**
	 * The derivative step size.
	 */
	private static final double DERIV_STEP = 1e-2;
/**
* The errors.
*/
private final double[] errors;
/**
* Construct the LMA object.
*
* @param network
* The network to train. Must have a single output neuron.
* @param training
* The training data to use. Must be indexable.
*/
public LevenbergMarquardtTraining(final BasicNetwork network,
final MLDataSet training) {
super(TrainingImplementationType.Iterative);
ValidateNetwork.validateMethodToData(network, training);
if (network.getOutputCount() != 1) {
throw new TrainingError(
"Levenberg Marquardt requires an output layer with a single neuron.");
}
setTraining(training);
this.indexableTraining = getTraining();
this.network = network;
this.trainingLength = (int) this.indexableTraining.getRecordCount();
this.parametersLength = this.network.getStructure().calculateSize();
this.hessianMatrix = new Matrix(this.parametersLength,
this.parametersLength);
this.hessian = this.hessianMatrix.getData();
this.beta = 1.0;
this.lambda = 0.1;
this.deltas = new double[this.parametersLength];
this.gradient = new double[this.parametersLength];
this.diagonal = new double[this.parametersLength];
this.errors = new double[this.trainingLength];
this.jacobian = new double[this.trainingLength][this.parametersLength];
final BasicMLData input = new BasicMLData(
this.indexableTraining.getInputSize());
final BasicMLData ideal = new BasicMLData(
this.indexableTraining.getIdealSize());
this.pair = new BasicMLDataPair(input, ideal);
		// set up the coefficient arrays for the finite difference method
		this.differentialCoefficients = createCoefficients(LevenbergMarquardtTraining.NUM_POINTS);
this.derivativeStepSize = new double[this.parametersLength];
		// initialize the per-parameter step sizes
		for (int i = 0; i < this.parametersLength; i++) {
			this.derivativeStepSize[i] = LevenbergMarquardtTraining.DERIV_STEP;
		}
}
	/**
	 * Calculate the gradient and the quasi-Hessian matrix. The gradient is
	 * g = J'e, and the Hessian is approximated from the Jacobian as
	 * H = beta * J'J; the diagonal of H is saved so that the
	 * Levenberg-Marquardt damping term can be reapplied on each attempt.
	 */
public void calculateHessian() {
for (int i = 0; i < this.parametersLength; i++) {
// Compute Jacobian Matrix Errors
double s = 0.0;
for (int j = 0; j < this.trainingLength; j++) {
s += this.jacobian[j][i] * this.errors[j];
}
this.gradient[i] = s;
// Compute Quasi-Hessian Matrix using Jacobian (H = J'J)
for (int j = 0; j < this.parametersLength; j++) {
double c = 0.0;
for (int k = 0; k < this.trainingLength; k++) {
c += this.jacobian[k][i] * this.jacobian[k][j];
}
this.hessian[i][j] = this.beta * c;
}
}
for (int i = 0; i < this.parametersLength; i++) {
this.diagonal[i] = this.hessian[i][i];
}
}
	/**
	 * Calculate half of the sum of the squared weights, as used by Bayesian
	 * regularization.
	 *
	 * @return Half of the sum of the squared weights.
	 */
private double calculateSumOfSquaredWeights() {
double result = 0;
for (final double weight : this.weights) {
result += weight * weight;
}
return result / 2.0;
}
@Override
public boolean canContinue() {
return false;
}
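	/**
	 * Numerically estimate the partial derivative of the network's single
	 * output with respect to one weight, using the precomputed finite
	 * difference coefficients.
	 *
	 * @param inputData
	 *            The input to present to the network.
	 * @param layer
	 *            The "from" layer of the connection being perturbed.
	 * @param neuron
	 *            The "to" neuron of the connection.
	 * @param weight
	 *            The "from" neuron (or bias) of the connection.
	 * @param stepSize
	 *            The per-parameter step sizes; entry jj is updated here.
	 * @param networkOutput
	 *            The network output for the unperturbed weight.
	 * @param jj
	 *            The index of this parameter in the Jacobian row.
	 * @return The estimated partial derivative.
	 */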
private double computeDerivative(final MLData inputData, final int layer,
final int neuron, final int weight, final double[] stepSize,
final double networkOutput, final int jj) {
final int numPoints = this.differentialCoefficients.length;
		double ret = 0.0;
		// Save a copy of the original weight value.
		final double originalValue = this.network.getWeight(layer, weight,
				neuron);
		final double[] points = new double[numPoints];
		// Scale the step size to the magnitude of the weight.
		if (originalValue != 0.0) {
			stepSize[jj] = LevenbergMarquardtTraining.DERIV_STEP
					* Math.abs(originalValue);
		} else {
			stepSize[jj] = LevenbergMarquardtTraining.DERIV_STEP;
		}
final int centerPoint = (numPoints - 1) / 2;
for (int i = 0; i < numPoints; i++) {
if (i != centerPoint) {
final double newValue = originalValue + ((i - centerPoint))
* stepSize[jj];
this.network.setWeight(layer, weight, neuron, newValue);
final MLData output = this.network.compute(inputData);
points[i] = output.getData(0);
} else {
points[i] = networkOutput;
}
}
		// Combine the sampled outputs with the first-derivative coefficients
		// (row 1 of the coefficient table), then divide by the step size
		// raised to the derivative order (1 for a first derivative).
		for (int i = 0; i < this.differentialCoefficients.length; i++) {
			ret += this.differentialCoefficients[centerPoint][1][i] * points[i];
		}
		ret /= stepSize[jj];
		// Change back the modified value.
		this.network.setWeight(layer, weight, neuron, originalValue);
		return ret;
}
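	/**
	 * Create the finite difference coefficient tables. For each possible
	 * center point, a matrix of Taylor series terms is built and inverted;
	 * row 1 of each resulting table holds the coefficients used here to
	 * estimate a first derivative.
	 *
	 * @param points
	 *            The number of points used by the finite difference.
	 * @return The coefficient tables, indexed by center point, derivative
	 *         order, and point.
	 */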
	private double[][][] createCoefficients(final int points) {
final double[][][] coefficients = new double[points][points][points];
for (int i = 0; i < points; i++) {
final Matrix delts = new Matrix(points, points);
final double[][] ptr = delts.getData();
for (int j = 0; j < points; j++) {
final double delt = (j - i);
double hterm = 1.0;
for (int k = 0; k < points; k++) {
ptr[j][k] = hterm / EncogMath.factorial(k);
hterm *= delt;
}
}
final Matrix invMatrix = delts.inverse();
final double dNumPointsFactorial = EncogMath.factorial(points);
for (int j = 0; j < points; j++) {
for (int k = 0; k < points; k++) {
coefficients[i][j][k] = (Math
.round(invMatrix.getData()[j][k]
* dNumPointsFactorial))
/ dNumPointsFactorial;
}
}
}
return coefficients;
}
/**
* @return The trained network.
*/
@Override
public MLMethod getMethod() {
return this.network;
}
	/**
	 * Perform one iteration. The Jacobian is computed by finite differences,
	 * the gradient and quasi-Hessian are formed from it, and the linear
	 * system (H + lambda * I) * delta = g is solved for the weight update.
	 * Lambda is increased until an update is found that lowers the objective
	 * function.
	 */
@Override
public void iteration() {
LUDecomposition decomposition = null;
preIteration();
this.weights = NetworkCODEC.networkToArray(this.network);
double sumOfSquaredErrors = jacobianByFiniteDifference();
calculateHessian();
// Define the objective function
// bayesian regularization objective function
final double objective = this.beta * sumOfSquaredErrors;
double current = objective + 1.0;
		// Start the main Levenberg-Marquardt method
this.lambda /= LevenbergMarquardtTraining.SCALE_LAMBDA;
// We'll try to find a direction with less error
// (or where the objective function is smaller)
while ((current >= objective)
&& (this.lambda < LevenbergMarquardtTraining.LAMBDA_MAX)) {
this.lambda *= LevenbergMarquardtTraining.SCALE_LAMBDA;
// Update diagonal (Levenberg-Marquardt formula)
for (int i = 0; i < this.parametersLength; i++) {
this.hessian[i][i] = this.diagonal[i] + this.lambda;
}
// Decompose to solve the linear system
decomposition = new LUDecomposition(this.hessianMatrix);
			// Check if the damped Hessian has become singular
if (!decomposition.isNonsingular()) {
continue;
}
			// Solve the linear system using the LU decomposition
this.deltas = decomposition.Solve(this.gradient);
// Update weights using the calculated deltas
updateWeights();
// Calculate the new error
sumOfSquaredErrors = 0.0;
for (int i = 0; i < this.trainingLength; i++) {
this.indexableTraining.getRecord(i, this.pair);
final MLData actual = this.network
.compute(this.pair.getInput());
final double e = this.pair.getIdeal().getData(0)
- actual.getData(0);
sumOfSquaredErrors += e * e;
}
sumOfSquaredErrors /= 2.0;
// Update the objective function
current = this.beta * sumOfSquaredErrors;
			// If the objective function is bigger than before, the method
			// is tried again using a greater damping factor.
}
		// If this iteration caused an error drop, then the next iteration
		// will use a smaller damping factor.
this.lambda /= LevenbergMarquardtTraining.SCALE_LAMBDA;
setError(sumOfSquaredErrors);
postIteration();
}
	/**
	 * Calculate the Jacobian matrix using finite differences.
	 *
	 * @return The sum of squared errors of the network, divided by 2.
	 */
private double jacobianByFiniteDifference() {
double e;
double sumOfSquaredErrors = 0;
		int ji = 0;
		// for each training vector
		for (final MLDataPair pair : getTraining()) {
final MLData networkOutput = this.network.compute(pair.getInput());
// Calculate network error to build the residuals vector
e = pair.getIdeal().getData(0) - networkOutput.getData(0);
this.errors[ji] = e;
sumOfSquaredErrors += e * e;
			// Computation of one of the Jacobian matrix rows by numerical
			// differentiation: for each weight wj in the network, we have to
			// compute its partial derivative to build the Jacobian matrix.
int jj = 0;
// So, for each layer:
for (int layer = this.network.getLayerCount() - 1; layer > 0; layer--) {
// for each neuron:
for (int neuron = 0; neuron < this.network
.getLayerNeuronCount(layer); neuron++) {
// for each weight:
for (int weight = 0; weight < this.network
.getLayerTotalNeuronCount(layer - 1); weight++) {
// Compute its partial derivative
this.jacobian[ji][jj] = computeDerivative(
pair.getInput(), layer - 1, neuron, weight,
this.derivativeStepSize,
networkOutput.getData(0), jj);
jj++;
}
}
}
ji++;
}
// returns the sum of squared errors / 2
return sumOfSquaredErrors / 2.0;
}
/**
* {@inheritDoc}
*/
@Override
public TrainingContinuation pause() {
return null;
}
/**
* {@inheritDoc}
*/
@Override
public void resume(final TrainingContinuation state) {
}
	/**
	 * Update the weights by adding the calculated deltas to the current
	 * weights and writing the result back to the network.
	 */
public void updateWeights() {
final double[] w = this.weights.clone();
for (int i = 0; i < w.length; i++) {
w[i] += this.deltas[i];
}
NetworkCODEC.arrayToNetwork(w, this.network);
}
}