RLPark 1.0.0
Reinforcement Learning Framework in Java
TDLambdaAutostep.java
package rlpark.plugin.rltoys.algorithms.predictions.td;

import rlpark.plugin.rltoys.algorithms.traces.ATraces;
import rlpark.plugin.rltoys.algorithms.traces.Traces;
import rlpark.plugin.rltoys.math.vector.DenseVector;
import rlpark.plugin.rltoys.math.vector.MutableVector;
import rlpark.plugin.rltoys.math.vector.RealVector;
import rlpark.plugin.rltoys.math.vector.implementations.PVector;
import rlpark.plugin.rltoys.math.vector.implementations.SVector;
import rlpark.plugin.rltoys.math.vector.implementations.Vectors;
import zephyr.plugin.core.api.internal.monitoring.wrappers.Abs;
import zephyr.plugin.core.api.internal.monitoring.wrappers.Squared;
import zephyr.plugin.core.api.monitoring.annotations.Monitor;

/**
 * On-policy TD(lambda) prediction with Autostep step-size adaptation: each
 * weight has its own step size, adapted online from the correlation between
 * the current update and a decaying memory of past updates.
 */
@Monitor
@SuppressWarnings("restriction")
public class TDLambdaAutostep implements OnPolicyTD {
  private static final long serialVersionUID = 1567652945995637498L;
  protected double mu = 0.01; // meta-learning rate of the step-size adaptation
  protected double tau = 1000; // time constant of the normalizer update
  @Monitor(level = 4)
  final private PVector v; // weight vector of the linear value function
  private double v_t; // prediction v(x_t) at the last update
  @Monitor(wrappers = { Squared.ID, Abs.ID })
  private double delta_t; // last TD error

  final protected Traces e; // eligibility traces
  @Monitor(level = 4)
  final protected PVector alpha; // per-feature step sizes
  @Monitor(level = 4)
  final protected PVector h; // decaying memory of recent weight updates
  @Monitor(level = 4)
  final protected PVector normalizer; // running normalizer of |delta * e_i * h_i|
  protected double maxOneM2;
  private final double gamma;
  private final double lambda;
  private double m;
  double tempM = 0.0;
  private final double lowerNumericalBound;

  public TDLambdaAutostep(double lambda, double gamma, int nbFeatures) {
    this(lambda, gamma, 0.1, nbFeatures, new ATraces());
  }

  public TDLambdaAutostep(double lambda, double gamma, int nbFeatures, Traces prototype) {
    this(lambda, gamma, 0.1, nbFeatures, prototype);
  }

  public TDLambdaAutostep(double lambda, double gamma, double initAlpha, int nbFeatures) {
    this(lambda, gamma, initAlpha, nbFeatures, new ATraces());
  }

  public TDLambdaAutostep(double lambda, double gamma, double initAlpha, int nbFeatures, Traces prototype) {
    this.lambda = lambda;
    e = prototype.newTraces(nbFeatures);
    this.gamma = gamma;
    v = new PVector(nbFeatures);
    alpha = new PVector(nbFeatures);
    alpha.set(initAlpha);
    h = new PVector(nbFeatures);
    normalizer = new PVector(nbFeatures);
    normalizer.set(1.0);
    lowerNumericalBound = Math.pow(10.0, -10) / nbFeatures;
  }

  public void setMu(double mu) {
    this.mu = mu;
  }

  public void setTau(double tau) {
    this.tau = tau;
  }

  protected double initEpisode() {
    e.clear();
    return 0;
  }

  @Override
  public double update(RealVector x_t, RealVector x_tp1, double r_tp1) {
    if (x_t == null) // start of a new episode: clear the eligibility traces
      return initEpisode();
    v_t = v.dotProduct(x_t);
    delta_t = r_tp1 + gamma * v.dotProduct(x_tp1) - v_t; // TD error
    e.update(lambda * gamma, x_t); // decay and accumulate the traces
    PVector densePhi = new PVector(x_t.accessData());
    if (e.vect() instanceof SVector)
      updateNormalizationAndStepSizeSparse(delta_t, densePhi.data);
    else if (e.vect() instanceof DenseVector)
      updateNormalizationAndStepSizeDense(delta_t, densePhi.data);
    else
      throw new RuntimeException("Not implemented");
    MutableVector eAlpha = e.vect().ebeMultiply(alpha); // alpha .* e
    MutableVector alphaDeltaE = eAlpha.mapMultiply(delta_t); // delta * alpha .* e
    v.addToSelf(alphaDeltaE); // TD(lambda) weight update
    // h accumulates recent updates: h += delta * alpha .* e - |alpha .* e .* x| .* h
    h.addToSelf(alphaDeltaE.subtractToSelf(Vectors.absToSelf(eAlpha.ebeMultiplyToSelf(densePhi)).ebeMultiplyToSelf(h)));
    return delta_t;
  }

  private void updateNormalizationAndStepSizeDense(double delta_t, double[] densePhi) {
    final double[] normalizerData = normalizer.data;
    final double[] alphaData = alpha.data;
    final double[] data = ((DenseVector) e.vect()).accessData();
    for (int i = 0; i < data.length; i++)
      updateStepSizeNormalizers(densePhi, normalizerData, alphaData, i, data[i], delta_t);
    // M = sum of effective step sizes; dividing by max(1, M) prevents overshooting
    m = 0.0;
    for (int i = 0; i < data.length; i++)
      m += featureNorm(densePhi, alphaData, i, data[i]);
    maxOneM2 = Math.max(1, m);
    for (int index = 0; index < data.length; index++)
      if (densePhi[index] != 0)
        alphaData[index] /= maxOneM2;
  }

  private void updateNormalizationAndStepSizeSparse(double delta_t, double[] densePhi) {
    final double[] normalizerData = normalizer.data;
    final double[] alphaData = alpha.data;
    final SVector se = (SVector) e.vect();
    for (int i = 0; i < se.nonZeroElements(); i++)
      updateStepSizeNormalizers(densePhi, normalizerData, alphaData, se.activeIndexes[i], se.values[i], delta_t);
    // Same renormalization as the dense case, restricted to the active trace indexes
    m = 0.0;
    for (int i = 0; i < se.nonZeroElements(); i++)
      m += featureNorm(densePhi, alphaData, se.activeIndexes[i], se.values[i]);
    maxOneM2 = Math.max(1, m);
    for (int index : se.activeIndexes)
      if (densePhi[index] != 0)
        alphaData[index] /= maxOneM2;
  }

  private void updateStepSizeNormalizers(double[] densePhi, final double[] normalizerData, final double[] alphaData,
      int eIndex, double eValue, final double delta_t) {
    double absDeltaEH = computeAbsDeltaEH(eIndex, eValue, delta_t);
    // Normalizer: max of the current |delta * e * h| and its slowly decaying average
    normalizerData[eIndex] = Math.max(absDeltaEH,
        normalizerData[eIndex] + (featureNorm(densePhi, alphaData, eIndex, eValue) / tau)
            * (absDeltaEH - normalizerData[eIndex]));
    normalizerData[eIndex] = Math.max(lowerNumericalBound, normalizerData[eIndex]);
    // Step size: multiplicative update driven by the normalized correlation delta * e * h
    alphaData[eIndex] = alphaData[eIndex] * Math.exp(mu * delta_t * eValue * h.data[eIndex] / normalizerData[eIndex]);
    alphaData[eIndex] = Math.max(lowerNumericalBound, alphaData[eIndex]);
  }

  private double featureNorm(double[] densePhi, final double[] alphaData, int index, double value) {
    return alphaData[index] * Math.abs(value * densePhi[index]);
  }

  private double computeAbsDeltaEH(int index, double traceValue, double delta) {
    return Math.abs(traceValue * h.data[index] * delta);
  }

  public Traces eligibility() {
    return e;
  }

  @Override
  public double predict(RealVector phi) {
    return v.dotProduct(phi);
  }

  @Override
  public PVector weights() {
    return v;
  }

  @Override
  public void resetWeight(int index) {
    v.data[index] = 0;
    alpha.data[index] = .1;
    h.data[index] = 0;
    normalizer.data[index] = 0;
    e.vect().setEntry(index, 0);
  }

  @Override
  public double error() {
    return delta_t;
  }

  @Override
  public double prediction() {
    return v_t;
  }

  public double gamma() {
    return gamma;
  }
}
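For context, the sketch below shows one way this predictor can be driven from client code. The class TDLambdaAutostepExample, the one-hot features, and the constant reward are illustrative assumptions, not part of RLPark; only the TDLambdaAutostep and PVector APIs above are taken from the library. Passing null as x_t signals the start of an episode and clears the eligibility traces.

package example;

import rlpark.plugin.rltoys.algorithms.predictions.td.TDLambdaAutostep;
import rlpark.plugin.rltoys.math.vector.implementations.PVector;

public class TDLambdaAutostepExample {
  public static void main(String[] args) {
    int nbFeatures = 4;
    // lambda = 0.9, gamma = 0.99, initial per-feature step size = 0.1
    TDLambdaAutostep td = new TDLambdaAutostep(0.9, 0.99, 0.1, nbFeatures);
    PVector x_t = null; // null marks the start of the episode
    for (int t = 0; t < 100; t++) {
      PVector x_tp1 = new PVector(nbFeatures);
      x_tp1.data[t % nbFeatures] = 1.0; // hypothetical one-hot feature vector
      double r_tp1 = 1.0; // hypothetical reward signal
      double delta = td.update(x_t, x_tp1, r_tp1); // returns the TD error
      double prediction = td.predict(x_tp1); // current value estimate
      System.out.println("delta=" + delta + " v(x)=" + prediction);
      x_t = x_tp1;
    }
  }
}

If the defaults are not suitable, setMu and setTau can be called after construction to change the meta-learning rate and the normalizer time constant before learning starts.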