RLPark 1.0.0
Reinforcement Learning Framework in Java
|
00001 package rlpark.plugin.rltoys.horde.demons; 00002 00003 import rlpark.plugin.rltoys.algorithms.LinearLearner; 00004 import rlpark.plugin.rltoys.algorithms.functions.Predictor; 00005 import rlpark.plugin.rltoys.algorithms.predictions.td.GTDLambda; 00006 import rlpark.plugin.rltoys.algorithms.predictions.td.GVF; 00007 import rlpark.plugin.rltoys.envio.actions.Action; 00008 import rlpark.plugin.rltoys.envio.policy.Policy; 00009 import rlpark.plugin.rltoys.horde.functions.ConstantGamma; 00010 import rlpark.plugin.rltoys.horde.functions.ConstantOutcomeFunction; 00011 import rlpark.plugin.rltoys.horde.functions.GammaFunction; 00012 import rlpark.plugin.rltoys.horde.functions.OutcomeFunction; 00013 import rlpark.plugin.rltoys.horde.functions.RewardFunction; 00014 import rlpark.plugin.rltoys.math.vector.RealVector; 00015 import zephyr.plugin.core.api.labels.Labeled; 00016 import zephyr.plugin.core.api.labels.Labels; 00017 import zephyr.plugin.core.api.monitoring.annotations.Monitor; 00018 00019 public class PredictionOffPolicyDemon implements Demon, Labeled { 00020 private static final long serialVersionUID = 2103050204892958885L; 00021 private final RewardFunction rewardFunction; 00022 @Monitor 00023 private final GVF gtd; 00024 @Monitor 00025 protected final Policy target; 00026 protected final Policy behaviour; 00027 @Monitor 00028 private double rho_t; 00029 private final OutcomeFunction outcomeFunction; 00030 private final GammaFunction gammaFunction; 00031 00032 public PredictionOffPolicyDemon(Policy target, Policy behaviour, GTDLambda gtd, RewardFunction rewardFunction) { 00033 this(target, behaviour, gtd, rewardFunction, new ConstantGamma(gtd.gamma()), new ConstantOutcomeFunction(0)); 00034 } 00035 00036 public PredictionOffPolicyDemon(Policy target, Policy behaviour, GVF gtd, RewardFunction rewardFunction, 00037 GammaFunction gammaFunction, OutcomeFunction outcomeFunction) { 00038 this.rewardFunction = rewardFunction; 00039 this.gammaFunction = gammaFunction; 00040 this.outcomeFunction = outcomeFunction; 00041 this.gtd = gtd; 00042 this.target = target; 00043 this.behaviour = behaviour; 00044 } 00045 00046 @Override 00047 public void update(RealVector x_t, Action a_t, RealVector x_tp1) { 00048 rho_t = a_t != null ? target.pi(a_t) / behaviour.pi(a_t) : 0; 00049 gtd.update(1, 1, x_t, x_tp1, rewardFunction.reward(), gammaFunction.gamma(), outcomeFunction.outcome()); 00050 } 00051 00052 public RewardFunction rewardFunction() { 00053 return rewardFunction; 00054 } 00055 00056 public Predictor predicter() { 00057 return gtd; 00058 } 00059 00060 public Policy targetPolicy() { 00061 return target; 00062 } 00063 00064 @Override 00065 public LinearLearner learner() { 00066 return gtd; 00067 } 00068 00069 @Override 00070 public String label() { 00071 return "offpolicyDemon" + Labels.label(rewardFunction); 00072 } 00073 }