```python
# encoding:utf-8
import numpy as np
import matplotlib.pyplot as plt

'''
Random-walk problem

    0 - 1 - 2 - 3 - 4 - 5 - 6
    e           s           e

Terminating at 0 gives reward 0; terminating at 6 gives reward 1;
every intermediate transition gives reward 0.
Policy: actions [-1, 1] (left / right), each chosen with probability 0.5.
Under this policy the true value grows with the state index.
'''
stats = range(7)
start = 3
end = [0, 6]           # terminal states
actions = [-1, 1]
gamma = 1              # discount factor (gamma = 1 in this example)
alpha = 0.5            # learning rate
epochs = [5, 10, 50, 100, 500, 1000, 10000]  # update counts, one curve each


def choose_act(stat):
    # random policy: left or right with equal probability
    if np.random.rand() > 0.5:
        return 1
    else:
        return -1


v = np.zeros(len(stats))
for i in epochs:
    for j in range(i):
        act = choose_act(start)
        stat_ = start + act
        if stat_ in end:
            if stat_ == 6:
                # only the transition into the right terminal carries reward 1
                v[start] += alpha * (1 + v[stat_] - v[start])
            else:
                v[start] += alpha * (v[stat_] - v[start])
        else:
            v[start] += alpha * (v[stat_] - v[start])
        # restart from a random non-terminal state after every single update
        start = np.random.randint(1, 6)
    plt.plot(v[1:-1])
    plt.text(stats[-4], v[-3], j + 1)  # label each curve with its update count
plt.xlabel('state')
plt.ylabel('v')
plt.text(1, 0.8, 'alpha = %s' % alpha)
plt.show()
```
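For reference, the inner loop above implements the standard TD(0) update (here gamma = 1, and the reward r is 1 only on the transition into state 6):

$$
V(s) \;\leftarrow\; V(s) + \alpha\,\bigl[\,r + \gamma V(s') - V(s)\,\bigr]
$$

With alpha as large as 0.5, each update moves V(s) halfway toward a noisy one-step target, so large swings in the curves are expected.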
You can see that as the learning rate grows the estimates improve faster, but at alpha = 0.5 the step size is clearly too large: the estimates visibly overshoot and oscillate around the true values instead of settling.
Note that this version updates per step, whereas the book evaluates per episode, so the results differ; the code will be updated when time permits.
```python
# encoding:utf-8
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt

stats = range(7)
end = [0, 6]                 # terminal states
actions = [-1, 1]
gamma = 1                    # discount factor (gamma = 1 in this example)


def choose_act(stat):
    # random policy: left or right with equal probability
    if np.random.rand() > 0.5:
        return 1
    else:
        return -1


v_t = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0]    # true state values
alpha_td = [0.1, 0.15, 0.2]               # TD learning rates
alpha_mc = [0.01, 0.02, 0.04]             # MC learning rates


def rms_error(v):
    # RMS error over the five non-terminal states
    return np.sqrt(np.mean([(v[s] - v_t[s]) ** 2 for s in range(1, 6)]))


for c in range(3):
    # --- TD(0): error recorded after every single step ---
    alpha = alpha_td[c]
    # v = np.random.rand(len(stats))
    # v = np.zeros(len(stats))
    v = [0.2] * len(stats)
    v[0] = v[6] = 0.0        # terminal states are worth 0 (bootstrap target)
    errors = []
    start = 3
    for _ in range(100):
        act = choose_act(start)
        stat_ = start + act
        if stat_ in end:
            if stat_ == 6:
                v[start] += alpha * (1 + v[stat_] - v[start])
            else:
                v[start] += alpha * (v[stat_] - v[start])
            start = np.random.randint(1, 6)
        else:
            v[start] += alpha * (v[stat_] - v[start])
            start = stat_  # continue the episode from the new state
        errors.append(rms_error(v))
    plt.plot(range(100), errors)
    index = np.random.randint(40, 100)
    plt.text(index - 3, errors[index], 'alpha_td = %s' % alpha)

    # --- first-visit MC: error recorded after every episode ---
    alpha = alpha_mc[c]
    # v_mc = np.random.rand(len(stats))
    # v_mc = np.zeros(len(stats))
    v_mc = [0.2] * len(stats)
    v_mc[0] = v_mc[6] = 0.0  # terminal states are worth 0
    # count_mc = np.zeros(len(stats))  # for the sample-average alternative below
    errors = []
    for _ in range(100):
        # roll out one full episode: a list of [state, action, reward] steps
        process = []
        start = 3  # np.random.randint(1, 6)
        while True:
            if start in end:
                process.append([start])
                break
            act = choose_act(start)
            r = 1 if (start == 5 and act == 1) else 0
            process.append([start, act, r])
            start = start + act
        s_all = [step[0] for step in process[:-1]]
        s_dealed = []
        for s in s_all:
            if s in s_dealed:
                continue               # first-visit: update each state once per episode
            t = s_all.index(s)         # position of the first visit to s
            g = sum(step[2] for step in process[t:-1])  # undiscounted return from that visit
            v_mc[s] += alpha * (g - v_mc[s])            # constant-alpha MC update
            # sample-average alternative:
            # v_mc[s] = (v_mc[s] * count_mc[s] + g) / (count_mc[s] + 1)
            # count_mc[s] += 1
            s_dealed.append(s)
        errors.append(rms_error(v_mc))
    plt.plot(range(100), errors, '.')
    index = np.random.randint(40, 100)
    plt.text(index - 3, errors[index], 'alpha_mc = %s' % alpha)

plt.xlabel('epoch')
plt.ylabel('rms error')
plt.show()
```
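For contrast with TD(0), the MC branch applies the constant-alpha first-visit Monte Carlo update, which moves V(s) toward the actual return of the episode rather than toward a bootstrapped one-step target:

$$
V(s) \;\leftarrow\; V(s) + \alpha\,\bigl[\,G_t - V(s)\,\bigr]
$$

where $G_t$ is the (undiscounted) sum of rewards from the first visit to $s$ until the episode terminates.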
The random walk has a peculiarity: there are two terminals, and one of them gives reward 0. In the first few rounds, single-step TD that happens to drift left needs many steps before any reward information from the right terminal can propagate back. MC, on the other hand, updates from whole episodes, each of which ends at one terminal or the other; starting from the center, an episode reaches the rewarding right terminal with probability 1/2, so the chance of seeing a nonzero return early is much higher. Hence MC visibly converges faster than TD in the first few rounds.
Overall, though, TD converges faster than MC and to a smaller error, so TD is the more efficient of the two here.
Problems with the code above:
1. TD computes its MSE after every single step, while MC computes it after every episode, so the two curves are not measured on the same basis.
2. The MSE is measured from a single evaluation run rather than averaged over many independent runs (a minimal sketch of this fix follows; the full revision comes after).
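A minimal sketch of how both problems can be addressed, assuming the same TD(0) setup as above: record the error once per episode (problem 1) and average the error curves over many independent runs (problem 2). The `td_run` helper, `alpha=0.1`, and the 100-run average are illustrative choices of mine, not the author's updated code:

```python
import numpy as np

V_TRUE = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0]

def td_run(alpha=0.1, n_episodes=100):
    """One independent TD(0) evaluation; returns the per-episode RMS errors."""
    v = [0.2] * 7
    v[0] = v[6] = 0.0                       # terminal states are worth 0
    errors = []
    for _ in range(n_episodes):
        s = 3
        while s not in (0, 6):              # play one full episode
            s_next = s + (1 if np.random.rand() > 0.5 else -1)
            r = 1 if s_next == 6 else 0     # reward only on entering state 6
            v[s] += alpha * (r + v[s_next] - v[s])
            s = s_next
        # error recorded once per episode, same basis as MC
        errors.append(np.sqrt(np.mean([(v[i] - V_TRUE[i]) ** 2
                                       for i in range(1, 6)])))
    return errors

# average the per-episode error curves over many independent runs
n_runs = 100
curves = np.array([td_run() for _ in range(n_runs)])  # shape (n_runs, 100)
mean_curve = curves.mean(axis=0)
```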
Updated code: