RLPark 1.0.0
Reinforcement Learning Framework in Java
ContinuousOffPolicyEvaluation.java
package rlpark.plugin.rltoys.experiments.parametersweep.offpolicy.evaluation;

import rlpark.plugin.rltoys.agents.offpolicy.OffPolicyAgentEvaluable;
import rlpark.plugin.rltoys.agents.representations.RepresentationFactory;
import rlpark.plugin.rltoys.envio.rl.RLAgent;
import rlpark.plugin.rltoys.experiments.helpers.Runner;
import rlpark.plugin.rltoys.experiments.helpers.Runner.RunnerEvent;
import rlpark.plugin.rltoys.experiments.parametersweep.interfaces.PerformanceEvaluator;
import rlpark.plugin.rltoys.experiments.parametersweep.onpolicy.internal.OnPolicyRewardMonitor;
import rlpark.plugin.rltoys.experiments.parametersweep.onpolicy.internal.RewardMonitorAverage;
import rlpark.plugin.rltoys.experiments.parametersweep.onpolicy.internal.RewardMonitorEpisode;
import rlpark.plugin.rltoys.experiments.parametersweep.parameters.Parameters;
import rlpark.plugin.rltoys.experiments.parametersweep.reinforcementlearning.OffPolicyProblemFactory;
import rlpark.plugin.rltoys.experiments.parametersweep.reinforcementlearning.RLParameters;
import rlpark.plugin.rltoys.problems.RLProblem;
import zephyr.plugin.core.api.signals.Listener;

/**
 * Online evaluation of an off-policy learner: an agent following the target
 * policy runs on a separate instance of the problem, advancing by one step
 * for every time step of the behaviour runner. When resetPeriod is positive,
 * the evaluation episode is restarted every resetPeriod time steps.
 */
public class ContinuousOffPolicyEvaluation extends AbstractOffPolicyEvaluation {
  private static final long serialVersionUID = -654783411988105997L;
  // Number of evaluation time steps between episode resets; -1 disables resets.
  private final int resetPeriod;

  public ContinuousOffPolicyEvaluation(int nbRewardCheckpoint) {
    this(nbRewardCheckpoint, -1);
  }

  public ContinuousOffPolicyEvaluation(int nbRewardCheckpoint, int resetPeriod) {
    super(nbRewardCheckpoint);
    this.resetPeriod = resetPeriod;
  }

  // A single continuing evaluation episode is binned over time steps;
  // multiple episodes (periodic resets) are binned per episode.
  private OnPolicyRewardMonitor createRewardMonitor(String prefix, int nbBins, int nbTimeSteps, int nbEpisode) {
    if (nbEpisode == 1)
      return new RewardMonitorAverage(prefix, nbBins, nbTimeSteps);
    return new RewardMonitorEpisode(prefix, nbBins, nbEpisode);
  }

  @Override
  public PerformanceEvaluator connectEvaluator(int counter, Runner behaviourRunner, OffPolicyProblemFactory problemFactory,
      RepresentationFactory projectorFactory, OffPolicyAgentEvaluable learningAgent, Parameters parameters) {
    if (RLParameters.nbEpisode(parameters) != 1)
      throw new RuntimeException("This evaluation does not support multiple episodes for the behaviour");
    // Evaluate the target policy on its own instance of the problem.
    RLProblem problem = createEvaluationProblem(counter, problemFactory);
    RLAgent evaluatedAgent = learningAgent.createEvaluatedAgent();
    // With periodic resets, the evaluation is split into multiple episodes;
    // otherwise it is a single continuing episode over the whole behaviour run.
    int nbEpisode = resetPeriod > 0 ? RLParameters.maxEpisodeTimeSteps(parameters) / nbRewardCheckpoint : 1;
    int nbTimeSteps = resetPeriod > 0 ? resetPeriod : RLParameters.maxEpisodeTimeSteps(parameters);
    final Runner runner = new Runner(problem, evaluatedAgent, nbEpisode, resetPeriod);
    OnPolicyRewardMonitor monitor = createRewardMonitor("Target", nbRewardCheckpoint, nbTimeSteps, nbEpisode);
    monitor.connect(runner);
    // Advance the evaluation runner by one step for each behaviour time step.
    behaviourRunner.onTimeStep.connect(new Listener<Runner.RunnerEvent>() {
      @Override
      public void listen(RunnerEvent eventInfo) {
        runner.step();
      }
    });
    return monitor;
  }
}
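The key mechanism above is the listener wiring at the end of connectEvaluator(): the evaluation runner has no loop of its own, it advances exactly one step each time the behaviour runner fires its onTimeStep signal, so the target policy is measured on the same time scale as the behaviour policy. The following self-contained sketch illustrates that wiring; Signal, BehaviourRunner, and EvaluationRunner are hypothetical stand-ins for illustration, not RLPark classes.

import java.util.ArrayList;
import java.util.List;

public class LockstepSketch {
  interface Listener<T> {
    void listen(T event);
  }

  // Minimal signal: a list of listeners fired on every event.
  static class Signal<T> {
    private final List<Listener<T>> listeners = new ArrayList<>();

    void connect(Listener<T> listener) {
      listeners.add(listener);
    }

    void fire(T event) {
      for (Listener<T> listener : listeners)
        listener.listen(event);
    }
  }

  // Stand-in for the behaviour runner: fires onTimeStep once per step.
  static class BehaviourRunner {
    final Signal<Integer> onTimeStep = new Signal<>();

    void run(int nbSteps) {
      for (int t = 0; t < nbSteps; t++)
        onTimeStep.fire(t);
    }
  }

  // Stand-in for the evaluation runner: steps the target-policy agent once per call.
  static class EvaluationRunner {
    int stepsDone = 0;

    void step() {
      stepsDone++; // here the evaluated agent would act on the evaluation problem
    }
  }

  public static void main(String[] args) {
    BehaviourRunner behaviour = new BehaviourRunner();
    final EvaluationRunner evaluation = new EvaluationRunner();
    // Same wiring as connectEvaluator(): one evaluation step per behaviour time step.
    behaviour.onTimeStep.connect(new Listener<Integer>() {
      @Override
      public void listen(Integer t) {
        evaluation.step();
      }
    });
    behaviour.run(1000);
    System.out.println("Evaluation steps: " + evaluation.stepsDone); // prints 1000
  }
}

Run as-is, the sketch prints "Evaluation steps: 1000": the evaluation advances in lockstep with the behaviour. In the real class, EvaluationRunner.step() corresponds to Runner.step(), which steps the evaluated agent on the separate evaluation problem and feeds the connected reward monitor.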