Reinforcement Learning Code in Practice - 07: The REINFORCE Algorithm
- Policy-based learning methods learn a target policy directly and explicitly.
- Policy gradients are the foundation of policy-based methods.
- A policy-based method searches for the optimal policy, i.e. the one that maximizes the expected return obtained in the environment.
- The update should make the policy sample actions that yield higher Q values more often (formalized right after this list).
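For reference, a minimal statement of what REINFORCE optimizes, written in standard policy-gradient notation (the symbols $\pi_\theta$, $\gamma$ and $G_t$ are conventional and are not defined in the original post): the objective is the expected discounted return, and its gradient weights each log-probability by the return observed from that step onward.

$$
J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}\!\left[\sum_{t=0}^{T} \gamma^{t} r_t\right],
\qquad
\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}\!\left[\sum_{t=0}^{T} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, G_t\right],
\qquad
G_t = \sum_{k=t}^{T} \gamma^{\,k-t} r_k
$$

In the code below, $\gamma = 0.98$, and the per-step loss `-prob.log() * reward_sum` is exactly the negated summand of this gradient estimate.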
```python
import random

import gym
import torch

env = gym.make("CartPole-v0")

# Initial agent state
state = env.reset()
# Number of discrete actions
actions = env.action_space.n
print(state, actions)

# Policy network: maps the 4-dim state to a probability distribution over the 2 actions
model = torch.nn.Sequential(
    torch.nn.Linear(4, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 2),
    torch.nn.Softmax(dim=1),
)


# Sample one action from the current policy
def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 4)
    prob = model(state)
    # Pick an action according to its probability
    action = random.choices(range(2), weights=prob[0].tolist(), k=1)[0]
    return action


# Play one episode and collect its data
def get_data():
    states = []
    actions = []
    rewards = []

    state = env.reset()
    done = False
    while not done:
        # Sample an action
        action = get_action(state)
        # Execute it and observe the environment's feedback
        next_state, reward, done, _ = env.step(action)
        # Store the transition
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state

    return states, actions, rewards


def test():
    state = env.reset()
    rewards_sum = 0
    done = False
    while not done:
        action = get_action(state)
        # Tracking down a bug on this line took a whole day
        state, reward, done, _ = env.step(action)
        rewards_sum += reward
    return rewards_sum


def train():
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Play N episodes and train once per episode (an online, on-policy algorithm)
    for epoch in range(1000):
        states, actions, rewards = get_data()

        optimizer.zero_grad()

        # Discounted return at each step, accumulated from the last step backwards
        reward_sum = 0
        for i in reversed(range(len(states))):
            # Each step further from the end, the running sum is discounted by 0.98,
            # then the reward of the current step is added
            reward_sum *= 0.98
            reward_sum += rewards[i]

            # Recompute the probability of the action actually taken in this state;
            # the model should learn to raise it in proportion to the return
            state = torch.FloatTensor(states[i]).reshape(1, 4)
            prob = model(state)
            prob = prob[0, actions[i]]

            # Policy-gradient loss: -log pi(a|s) weighted by the return
            loss = -prob.log() * reward_sum

            # Gradients accumulate across these per-step backward calls
            # and are only applied at optimizer.step()
            loss.backward(retain_graph=True)

        optimizer.step()

        if epoch % 100 == 0:
            test_result = sum([test() for _ in range(50)]) / 50
            print(epoch, test_result)


train()
```
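The loop above calls `backward()` once per time step, so gradients are accumulated step by step. A common alternative is to precompute all discounted returns and take a single vectorized gradient step per episode. The sketch below assumes `model` and `get_data` from the code above are in scope; `train_batched` and its parameters are illustrative names, not part of the original post.

```python
import numpy as np
import torch


def train_batched(episodes=1000, gamma=0.98, lr=1e-3):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(episodes):
        states, acts, rewards = get_data()

        # Discounted return G_t for every step, built from the end backwards
        returns = []
        g = 0.0
        for r in reversed(rewards):
            g = r + gamma * g
            returns.append(g)
        returns.reverse()

        states_t = torch.FloatTensor(np.array(states))        # [T, 4]
        acts_t = torch.LongTensor(acts).reshape(-1, 1)        # [T, 1]
        returns_t = torch.FloatTensor(returns)                # [T]

        # log pi(a_t | s_t) for the actions actually taken
        probs = model(states_t).gather(1, acts_t).squeeze(1)  # [T]
        loss = -(probs.log() * returns_t).sum()

        optimizer.zero_grad()
        loss.backward()  # one backward per episode, no retain_graph needed
        optimizer.step()
```

Both versions optimize the same objective; the single-backward form just avoids keeping many small computation graphs alive and tends to run faster.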
Always remember the kind of person you want to become!