Reinforcement Learning Code in Practice - 03 Dynamic Programming (FrozenLake test)

import gym
import numpy as np
from matplotlib import pyplot as plt


# Build a deterministic 4x4 FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False,
               map_name='4x4', desc=['SFFF', 'FHFH',
                                     'FFFH', 'HFFG'])
env.reset()
env = env.unwrapped

def show():
    # Render the current grid as an RGB array and display it with matplotlib
    plt.imshow(env.render(mode='rgb_array'))
    plt.show()

print(f"number of states: {len(env.P)}, transition details for state 0: {env.P[0]}")

# Initialize the value of every grid cell
values = np.zeros(16)
# Initialize the action probabilities for every cell (uniform random policy)
pi = np.ones([16, 4]) * 0.25
algorithm = "value iteration"
values, pi  # notebook-style display of the initial tables

# Compute the state-action value Q(s, a)
def get_qsa(state, action):
    value = 0.0
    for prob, next_state, reward, done in env.P[state][action]:
        # Discounted value of the next state (gamma = 0.9)
        next_value = values[next_state] * 0.9
        # If the next state is terminal (goal or hole), its future value is 0
        if done:
            next_value = 0
        # Value of this transition = reward + discounted next-state value
        next_value += reward
        # Weight by the transition probability
        next_value *= prob
        # Sum over all possible next states -> expected value
        value += next_value
    return value
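get_qsa is the one-step Bellman backup: it sums, over all possible next states, the transition probability times (reward + 0.9 * next-state value), with terminal states contributing no future value. As a small sanity check (again assuming action 2 is RIGHT), even with the value table still all zeros, the action that steps onto the goal already has a non-zero Q value:

# With values still all zero, only the transition into the goal carries a reward,
# so Q(state 14, RIGHT) evaluates to 1.0 before any iteration has run.
print(get_qsa(14, 2))  # 1.0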
# Policy evaluation
def get_values():
    new_values = np.zeros(16)
    for state in range(16):
        action_value = np.zeros(4)
        for action in range(4):
            # Value of each action in this state
            action_value[action] = get_qsa(state, action)
        if algorithm == "policy iteration":
            # Expected value under the current policy
            action_value *= pi[state]
            new_values[state] = action_value.sum()
        if algorithm == "value iteration":
            # Use the best action's value as the state's value
            new_values[state] = action_value.max()
    return new_values
# Policy improvement
def get_pi():
    new_pi = np.zeros([16, 4])
    for state in range(16):
        action_value = np.zeros(4)
        for action in range(4):
            action_value[action] = get_qsa(state, action)
        # Act greedily; split probability evenly among tied best actions
        count = (action_value == action_value.max()).sum()
        for action in range(4):
            if action_value[action] == action_value.max():
                new_pi[state, action] = 1 / count
            else:
                new_pi[state, action] = 0
    return new_pi

# Alternate between evaluation and improvement:
# 100 sweeps of value updates, then one greedy policy update, repeated 10 times
for _ in range(10):
    for _ in range(100):
        values = get_values()
    pi = get_pi()

values.reshape(4, 4), pi  # notebook-style display of the converged values and policy
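To verify the result, a short rollout with the learned policy can be run; a minimal sketch, assuming the same classic gym API used above (env.reset() returning just the state and env.step() returning a 4-tuple):

# Play one episode by sampling actions from the learned policy pi.
state = env.reset()
done = False
total_reward = 0.0
while not done:
    action = np.random.choice(4, p=pi[state])
    state, reward, done, info = env.step(action)
    total_reward += reward
print(f"episode return: {total_reward}")  # 1.0 means the agent reached the goal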

 
