RLPark 1.0.0
Reinforcement Learning Framework in Java
package rlpark.plugin.rltoys.algorithms.control.acting;

import rlpark.plugin.rltoys.algorithms.functions.Predictor;
import rlpark.plugin.rltoys.algorithms.functions.stateactions.StateToStateAction;
import rlpark.plugin.rltoys.envio.actions.Action;
import rlpark.plugin.rltoys.envio.policy.DiscreteActionPolicy;
import rlpark.plugin.rltoys.envio.policy.Policy;
import rlpark.plugin.rltoys.envio.policy.PolicyPrototype;
import rlpark.plugin.rltoys.math.vector.RealVector;
import rlpark.plugin.rltoys.utils.Utils;
import zephyr.plugin.core.api.monitoring.annotations.Monitor;

/** Greedy policy: always selects the action with the highest predicted state-action value. */
public class Greedy implements DiscreteActionPolicy, PolicyPrototype {
  private static final long serialVersionUID = 1675962692054005355L;
  protected final StateToStateAction toStateAction;
  protected final Predictor predictor;
  protected final Action[] actions;
  @Monitor
  protected final double[] actionValues;
  protected Action bestAction;
  @Monitor
  private double bestValue;

  public Greedy(Predictor predictor, Action[] actions, StateToStateAction toStateAction) {
    this.toStateAction = toStateAction;
    this.predictor = predictor;
    this.actions = actions;
    actionValues = new double[actions.length];
  }

  @Override
  public Action sampleAction() {
    return bestAction;
  }

  /** Recomputes the action values for the new state and caches the greedy action. */
  @Override
  public void update(RealVector x_tp1) {
    updateActionValues(x_tp1);
    findBestAction();
  }

  private void findBestAction() {
    bestValue = actionValues[0];
    bestAction = actions[0];
    for (int i = 1; i < actions.length; i++) {
      double value = actionValues[i];
      if (value > bestValue) {
        bestValue = value;
        bestAction = actions[i];
      }
    }
  }

  /** Evaluates the predictor on the state-action features of each available action. */
  private void updateActionValues(RealVector s_tp1) {
    for (int i = 0; i < actions.length; i++) {
      RealVector phi_sa = toStateAction.stateAction(s_tp1, actions[i]);
      actionValues[i] = predictor.predict(phi_sa);
    }
  }

  /** Probability of selecting action a: 1 for the greedy action, 0 otherwise. */
  @Override
  public double pi(Action a) {
    return a == bestAction ? 1 : 0;
  }

  public StateToStateAction toStateAction() {
    return toStateAction;
  }

  public Action bestAction() {
    return bestAction;
  }

  public double bestActionValue() {
    return bestValue;
  }

  @Override
  public double[] values() {
    return actionValues;
  }

  @Override
  public Action[] actions() {
    return actions;
  }

  @Override
  public Policy duplicate() {
    return new Greedy(predictor, actions, Utils.clone(toStateAction));
  }
}
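For context, a minimal sketch of how this policy could be driven inside an agent step, assuming a trained Predictor, an Action array, and a StateToStateAction feature mapping already exist elsewhere; the names q, actions, toStateAction and x_tp1 below are placeholders for this sketch, not part of the listing above:

  // Hypothetical setup: q, actions and toStateAction are assumed to be created elsewhere.
  Greedy policy = new Greedy(q, actions, toStateAction);

  // On each step: refresh the cached action values for the new state feature vector x_tp1,
  // then act greedily with respect to the predictor.
  policy.update(x_tp1);
  Action a_tp1 = policy.sampleAction();
  double qMax = policy.bestActionValue(); // predicted value of the greedy action

Because update(RealVector) caches both the greedy action and its value, sampleAction(), bestAction() and bestActionValue() all refer to the most recent state passed to update.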