RLPark: PredictionOffPolicyDemon.java Source File

RLPark 1.0.0
Reinforcement Learning Framework in Java
File List
00001 package rlpark.plugin.rltoys.horde.demons;
00002 
00003 import rlpark.plugin.rltoys.algorithms.LinearLearner;
00004 import rlpark.plugin.rltoys.algorithms.functions.Predictor;
00005 import rlpark.plugin.rltoys.algorithms.predictions.td.GTDLambda;
00006 import rlpark.plugin.rltoys.algorithms.predictions.td.GVF;
00007 import rlpark.plugin.rltoys.envio.actions.Action;
00008 import rlpark.plugin.rltoys.envio.policy.Policy;
00009 import rlpark.plugin.rltoys.horde.functions.ConstantGamma;
00010 import rlpark.plugin.rltoys.horde.functions.ConstantOutcomeFunction;
00011 import rlpark.plugin.rltoys.horde.functions.GammaFunction;
00012 import rlpark.plugin.rltoys.horde.functions.OutcomeFunction;
00013 import rlpark.plugin.rltoys.horde.functions.RewardFunction;
00014 import rlpark.plugin.rltoys.math.vector.RealVector;
00015 import zephyr.plugin.core.api.labels.Labeled;
00016 import zephyr.plugin.core.api.labels.Labels;
00017 import zephyr.plugin.core.api.monitoring.annotations.Monitor;
00018 
00019 public class PredictionOffPolicyDemon implements Demon, Labeled {
00020   private static final long serialVersionUID = 2103050204892958885L;
00021   private final RewardFunction rewardFunction;
00022   @Monitor
00023   private final GVF gtd;
00024   @Monitor
00025   protected final Policy target;
00026   protected final Policy behaviour;
00027   @Monitor
00028   private double rho_t;
00029   private final OutcomeFunction outcomeFunction;
00030   private final GammaFunction gammaFunction;
00031 
00032   public PredictionOffPolicyDemon(Policy target, Policy behaviour, GTDLambda gtd, RewardFunction rewardFunction) {
00033     this(target, behaviour, gtd, rewardFunction, new ConstantGamma(gtd.gamma()), new ConstantOutcomeFunction(0));
00034   }
00035 
00036   public PredictionOffPolicyDemon(Policy target, Policy behaviour, GVF gtd, RewardFunction rewardFunction,
00037       GammaFunction gammaFunction, OutcomeFunction outcomeFunction) {
00038     this.rewardFunction = rewardFunction;
00039     this.gammaFunction = gammaFunction;
00040     this.outcomeFunction = outcomeFunction;
00041     this.gtd = gtd;
00042     this.target = target;
00043     this.behaviour = behaviour;
00044   }
00045 
00046   @Override
00047   public void update(RealVector x_t, Action a_t, RealVector x_tp1) {
00048     rho_t = a_t != null ? target.pi(a_t) / behaviour.pi(a_t) : 0;
00049     gtd.update(1, 1, x_t, x_tp1, rewardFunction.reward(), gammaFunction.gamma(), outcomeFunction.outcome());
00050   }
00051 
00052   public RewardFunction rewardFunction() {
00053     return rewardFunction;
00054   }
00055 
00056   public Predictor predicter() {
00057     return gtd;
00058   }
00059 
00060   public Policy targetPolicy() {
00061     return target;
00062   }
00063 
00064   @Override
00065   public LinearLearner learner() {
00066     return gtd;
00067   }
00068 
00069   @Override
00070   public String label() {
00071     return "offpolicyDemon" + Labels.label(rewardFunction);
00072   }
00073 }