Reinforcement Learning Code in Practice - 03 Dynamic Programming Algorithms (FrozenLake Game Test)
import os
import gym
import numpy as np
from matplotlib import pyplot as plt

# Build a 4x4 non-slippery FrozenLake environment with a custom map
env = gym.make('FrozenLake-v1', is_slippery=False, map_name='4x4',
               desc=['SFFF', 'FHFH', 'FFFH', 'HFFG'])
env.reset()
env = env.unwrapped

def show():
    # Render the current frame (pre-0.26 gym render API)
    plt.imshow(env.render(mode='rgb_array'))
    plt.show()

print(f"Number of states: {len(env.P)}, transition details for state 0: {env.P[0]}")

# Initialize the value of each grid cell
values = np.zeros(16)
# Initialize the action probabilities of each cell (uniform random policy)
pi = np.ones([16, 4]) * 0.25
# Switch between "value iteration" and "policy iteration"
algorithm = "value iteration"
print(values, pi)

# Compute the state-action value Q(s, a)
def get_qsa(state, action):
    value = 0.0
    for prob, next_state, reward, done in env.P[state][action]:
        # Discounted value of the next state (discount factor 0.9)
        next_value = values[next_state] * 0.9
        # If the next state is terminal (goal or hole), its value is 0
        if done:
            next_value = 0
        # Add the immediate reward for this transition
        next_value += reward
        # The next state occurs with some probability, so weight by it
        next_value *= prob
        # Sum over all transitions: expected value
        value += next_value
    return value

# Policy evaluation
def get_values():
    new_values = np.zeros(16)
    for state in range(16):
        action_value = np.zeros(4)
        for action in range(4):
            # Value of each action in this state
            action_value[action] = get_qsa(state, action)
        if algorithm == "policy iteration":
            # Expected value over actions under the current policy
            action_value *= pi[state]
            new_values[state] = action_value.sum()
        if algorithm == "value iteration":
            # Take the best action's value as the cell's value
            new_values[state] = action_value.max()
    return new_values

# Policy improvement
def get_pi():
    new_pi = np.zeros([16, 4])
    for state in range(16):
        action_value = np.zeros(4)
        for action in range(4):
            action_value[action] = get_qsa(state, action)
        # Spread probability evenly over all best actions
        count = (action_value == action_value.max()).sum()
        for action in range(4):
            if action_value[action] == action_value.max():
                new_pi[state, action] = 1 / count
            else:
                new_pi[state, action] = 0
    return new_pi

# Alternate between policy evaluation and policy improvement
for _ in range(10):
    for _ in range(100):
        values = get_values()
    pi = get_pi()

print(values.reshape(4, 4))
print(pi)
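To check that the resulting greedy policy actually reaches the goal, a short rollout sketch like the one below can be appended. It reuses the `env` and `pi` variables from the listing above and assumes the same pre-0.26 gym step API as the rest of the code (where `reset()` returns the state and `step()` returns a 4-tuple); the `play` helper name is hypothetical and not part of the original post.

# Hypothetical helper: roll out the greedy policy once and report the return.
# Assumes the old gym (<0.26) API: reset() -> state, step() -> (state, reward, done, info).
def play(max_steps=100):
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        # Pick the action with the highest probability under the learned policy
        action = int(np.argmax(pi[state]))
        state, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

print("Return of one greedy rollout:", play())

With is_slippery=False the transitions are deterministic, so a return of 1.0 indicates the policy walks from S to G without falling into a hole.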
Always keep in mind the kind of person you want to become!