/*
* Javlov - a Java toolkit for reinforcement learning with multi-agent support.
*
* Copyright (c) 2009 Matthijs Snel
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package net.javlov.policy;

import java.util.List;

import net.javlov.Action;
import net.javlov.Actor;
import net.javlov.Option;
import net.javlov.Policy;
import net.javlov.QFunction;
import net.javlov.State;
/**
* Uses a softmax distribution over the stored action values to select an action. Consequently,
* this actor can only be used with discrete actions.
*
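* The sketch below illustrates the kind of softmax computation this relies on; it is
* illustrative only, since the actual calculation is delegated to {@link SoftmaxPolicy}
* and may differ in detail (e.g. it could include a temperature parameter):
*
* <pre>{@code
* // prefs[i] holds the stored value for discrete action a_i in the current state
* double[] probs = new double[prefs.length];
* double sum = 0;
* for (int i = 0; i < prefs.length; i++) {
*     probs[i] = Math.exp(prefs[i]); // softmax numerator
*     sum += probs[i];
* }
* for (int i = 0; i < probs.length; i++)
*     probs[i] /= sum; // resulting probabilities sum to 1
* }</pre>
*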
* @author Matthijs Snel
*
*/
public class SoftmaxActor implements Actor {
/**
* The Q-function that will be used to store action selection probabilities.
*/
protected QFunction q;
/**
* The last selected action; stored in order to be able to update it with the TD error.
*/
protected Option lastOption;
/**
* Policy used to select actions from probabilities.
*/
protected Policy pi;
/**
* Learning rate alpha.
*/
protected double alpha = 0.1;
/**
* Constructs an actor based on the provided Q-function and action pool. Note that
* the {@code QFunction} is used here purely as a storage medium: it does not store
* Q-values, but instead the (unnormalised) probabilities of selecting each action.
*
* The stored values do not need to sum to 1, since this actor applies a softmax
* distribution over them, which guarantees that the resulting probabilities sum to 1
* anyway. If a tabular Q-function is used, it is nevertheless recommended to initialise
* the values to something greater than 0, e.g. all equal to 1 / (number of actions).
*
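* A usage sketch (the {@code MyTabularQFunction} type below is a placeholder, not a
* class provided by this toolkit):
*
* <pre>{@code
* List<Action> actions = ...;                    // the discrete action pool
* QFunction q = new MyTabularQFunction(actions); // hypothetical tabular storage
* // initialise each entry to 1.0 / actions.size(), as recommended above
* Actor actor = new SoftmaxActor(q, actions);
* }</pre>
*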
* @param q the Q-function that stores the action probabilities.
* @param actions the pool of available actions.
*/
public SoftmaxActor(QFunction q, List<? extends Action> actions) {
this.q = q;
pi = new SoftmaxPolicy(q, actions);
}
/**
* {@inheritDoc}
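*
* The selected option is also registered with the Q-function (via
* {@code setLastOption}), so that a subsequent call to {@link #update(double)}
* can update its value with the TD error.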
*/
@Override
public <T> Option getOption(State<T> s) {
Option o = pi.getOption(s);
q.setLastOption(o);
return o;
}
/**
* {@inheritDoc}
*/
@Override
public double getLearnRate() {
return alpha;
}
/**
* {@inheritDoc}
*/
@Override
public void init() {
q.init();
}
/**
* {@inheritDoc}
*/
@Override
public void reset() {
q.reset();
}
/**
* {@inheritDoc}
*/
@Override
public void setLearnRate(double alpha) {
this.alpha = alpha;
}
/**
* Adds the provided TD error, multiplied by the learning rate alpha,
* to the current probability of the action that was selected last.
* Since the softmax is taken over the stored values, the probabilities of selecting
* the other actions decrease accordingly, so that all probabilities still sum to 1.
*
* This implementation uses the TD error directly to increase the probability of the
* last selected action, i.e.
*
* {@code p(s,a) = p(s,a) + alpha*TDerr},
*
* or if eligibility traces are used (simply pass the actor a "traced" q-function):
*
* {@code p(s,a) = p(s,a) + alpha*TDerr*e(s,a).}
*
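* For example, with {@code alpha = 0.1} and a TD error of 0.5, the stored value of the
* last selected action is increased by 0.1 * 0.5 = 0.05 before the softmax is applied
* again at the next selection.
*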
* @param TDerr the TD error that will be used to update the probability of the last
* selected action.
*/
@Override
public <T> void update(double TDerr) {
q.update(TDerr);
}
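/**
* Not supported by this actor; always throws an {@link UnsupportedOperationException}.
*/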
@Override
public <T> Option getOption(State<T> s, double[] qvalues) {
throw new UnsupportedOperationException();
}
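/**
* Delegates to the underlying softmax policy to compute the option probabilities for
* the given state and values.
*/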
@Override
public <T> double[] getOptionProbabilities(State<T> s, double[] qvalues) {
return pi.getOptionProbabilities(s, qvalues);
}
}