Reinforcement Learning Code Practice 07: The REINFORCE Algorithm

  • Policy-based learning methods: directly and explicitly learn a target policy

  • Policy gradients are the foundation of policy-based methods

  • Policy-based learning methods: search for the optimal policy, i.e. maximize that policy's expected return in the environment

  • Intuitively, the policy should sample actions with higher Q values (returns) more often, as formalized just below
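
The update REINFORCE performs is the standard policy gradient, stated here for reference (my formulation, not copied from the post):

$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\Big[\sum_{t} G_t \,\nabla_\theta \log \pi_\theta(a_t \mid s_t)\Big], \qquad G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k$$

The code below implements exactly this: G_t is accumulated backwards with γ = 0.98, and each step contributes a loss of -log πθ(a_t|s_t) · G_t.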

import random
import gym
import torch
import numpy as np
from matplotlib import pyplot as plt
from IPython import display

env = gym.make("CartPole-v0")
# initial state of the agent
state = env.reset()
# number of discrete actions
actions = env.action_space.n
print(state, actions)
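
Note: the code above assumes the classic gym API, where reset() returns only the observation and step() returns a 4-tuple. On gym >= 0.26 or gymnasium the interface differs; a rough equivalent setup (a sketch, not part of the original post) would be:

# Sketch for gym >= 0.26 / gymnasium: reset() returns (obs, info) and
# step() returns (obs, reward, terminated, truncated, info)
env = gym.make("CartPole-v1")
state, _ = env.reset()
actions = env.action_space.n
print(state, actions)

The rest of the code would then need to unpack the 5-tuple from step() and treat done = terminated or truncated.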


# Define the policy network: 4-dim state -> probabilities over 2 actions
model = torch.nn.Sequential(
    torch.nn.Linear(4, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 2),
    torch.nn.Softmax(dim=1),
)
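
A quick sanity check of the network (an illustrative snippet; dummy_state is just a made-up input): the Softmax layer guarantees each output row is a probability distribution over the two actions.

dummy_state = torch.zeros(1, 4)
print(model(dummy_state))    # two action probabilities that sum to 1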

# Sample an action from the current policy
def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 4)
    prob = model(state)
    # sample an action according to the predicted probabilities
    action = random.choices(range(2), weights=prob[0].tolist(), k=1)[0]
    return action
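
An equivalent way to sample (an illustrative variant, not from the original post) uses torch.multinomial and avoids building a computation graph during the rollout; the name get_action_multinomial is hypothetical:

def get_action_multinomial(state):
    # sample an action index from the policy's output distribution
    state = torch.FloatTensor(state).reshape(1, 4)
    with torch.no_grad():
        prob = model(state)
    return torch.multinomial(prob, num_samples=1).item()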

# Collect the data of one full episode (states, actions, rewards)
def get_data():
    states = []
    actions = []
    rewards = []
    
    state = env.reset()
    done = False
    while not done:
        # choose an action with the current policy
        action = get_action(state)
        # step the environment and observe the feedback
        next_state, reward, done, _ = env.step(action)
        # store the transition
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        
        state = next_state
        
    return states, actions, rewards
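
For reference, the discounted returns G_t that the training loop below accumulates on the fly can also be precomputed for a whole episode (a helper sketch; compute_returns does not appear in the original code):

def compute_returns(rewards, gamma=0.98):
    # G_t = r_t + gamma * G_{t+1}, filled in from the last step backwards
    returns = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.insert(0, g)
    return returns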

# Evaluate the current policy for one episode and return the total reward
def test():
    state = env.reset()
    rewards_sum = 0
    done = False
    
    while not done:
        action = get_action(state)
        state, reward, done, _ = env.step(action)    # tracking down a bug here took a whole day
        rewards_sum += reward
    return rewards_sum

# REINFORCE training loop: one policy update per episode
def train():
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    # Play N episodes; update the policy once per episode (on-policy, online learning)
    for epoch in range(1000):
        states, actions, rewards = get_data()
        optimizer.zero_grad()
        # running discounted return
        reward_sum = 0
        
        # compute the return at each time step, starting from the last step
        for i in reversed(range(len(states))):
            # moving one step back, the accumulated return is multiplied by the
            # discount factor 0.98, then the current step's reward is added
            reward_sum *= 0.98
            reward_sum += rewards[i]
            # recompute the probability the current policy assigns to the action
            # that was actually taken in this state during sampling
            state = torch.FloatTensor(states[i]).reshape(1, 4)
            prob = model(state)
            prob = prob[0, actions[i]]
            
            # REINFORCE loss: negative log-probability weighted by the return
            loss = -prob.log() * reward_sum
            # accumulate gradients across time steps (retain_graph=True keeps the
            # computation graph alive so backward can be called repeatedly)
            loss.backward(retain_graph=True)
        optimizer.step()
        
        if epoch % 100 == 0:
            test_result = sum([test() for _ in range(50)]) / 50
            print(epoch, test_result)
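
The post does not show an entry point; running the experiment end to end is simply (a minimal usage sketch):

train()
print("average reward over 50 evaluation episodes:", sum(test() for _ in range(50)) / 50)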

 

posted @ 2022-11-15 18:43  今夜无风