Reinforcement Learning Code in Practice - 08: The PPO Algorithm (Pendulum)
Handling continuous-action problems
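Pendulum differs from the discrete-action environments in the earlier posts: the action is a continuous torque, so the actor cannot output a softmax over a finite set of actions. Instead it outputs the mean and standard deviation of a Normal distribution, the action is drawn by sampling, and its (log-)density plays the role that the discrete action probability plays in the PPO ratio. A minimal standalone sketch of that idea (the numbers below are made up for illustration, not produced by the trained model):

import torch

# Hypothetical policy output for a single state: mean and std of the action density
mu = torch.tensor([[0.3]])
std = torch.tensor([[0.5]])

dist = torch.distributions.Normal(mu, std)
action = dist.sample()            # a continuous action, e.g. a torque value
log_prob = dist.log_prob(action)  # log-density of that action, used to build the PPO ratio
print(action.item(), log_prob.exp().item())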
""" @Date :2022/11/2 @Fun: 倒立摆控制 """ import random import gym import torch import numpy as np from matplotlib import pyplot as plt from IPython import display env = gym.make("Pendulum-v0") # 智能体状态 state = env.reset() # 动作空间(连续性问题) actions = env.action_space print(state, actions) # 演员模型:接收一个状态,使用抽样方式确定动作 class Model(torch.nn.Module): """ 继承nn.Module,必须实现__init__() 方法和forward()方法。其中__init__() 方法里创建子模块,在forward()方法里拼接子模块。 """ def __init__(self): super().__init__() self.fc_state = torch.nn.Sequential(torch.nn.Linear(3, 128), torch.nn.ReLU(), ) self.fc_mu = torch.nn.Sequential(torch.nn.Linear(128, 1), torch.nn.Tanh()) self.fc_std = torch.nn.Sequential(torch.nn.Linear(128, 1), torch.nn.Softplus()) def forward(self, state): state = self.fc_state(state) mu = self.fc_mu(state) std = self.fc_std(state) return mu, std # 学一个针对state的概率密度函数mu,std actor_model = Model() # 评论员模型:评价一个状态的价值,给出多好的得分 critic_model = torch.nn.Sequential(torch.nn.Linear(3, 128), torch.nn.ReLU(), torch.nn.Linear(128, 1)) # 演员模型执行一个动作(采样获得) def get_action(state): state = torch.FloatTensor(state).reshape(1, 3) mu, std = actor_model(state) # 通过服从(mu, std)的概率密度函数得到连续性动作 action = torch.distributions.Normal(mu, std).sample().item() return action # 获取一个回合的样本数据 def get_data(): states = [] rewards = [] actions = [] next_states = [] dones = [] state = env.reset() done = False while not done: action = get_action(state) next_state, reward, done, _ = env.step([action]) states.append(state) rewards.append(reward) actions.append(action) next_states.append(next_state) dones.append(done) state = next_state # 转换为tensor states = torch.FloatTensor(states).reshape(-1, 3) rewards = torch.FloatTensor(rewards).reshape(-1, 1) actions = torch.FloatTensor(actions).reshape(-1, 1) # 动作连续 next_states = torch.FloatTensor(next_states).reshape(-1, 3) dones = torch.LongTensor(dones).reshape(-1, 1) return states, actions, rewards, next_states, dones def test(): state = env.reset() reward_sum = 0 over = False while not over: action = get_action(state) state, reward, over, _ = env.step([action]) reward_sum += reward return reward_sum # 优势函数 def get_advantage(deltas): # 算法来源:GAE,广义优势估计方法。便于计算从后往前累积优势 advantages = [] s = 0 for delta in deltas[::-1]: s = 0.98 * 0.95 * s + delta advantages.append(s) advantages.reverse() return advantages print(get_advantage([0.8, 0.9, 0.99, 1.00, 1.11, 1.12])) def train(): optimizer = torch.optim.Adam(actor_model.parameters(), lr=1e-5) optimizer_td = torch.optim.Adam(critic_model.parameters(), lr=1e-3) # 玩N局游戏,每局游戏玩M次 for epoch in range(1000): states, actions, rewards, next_states, dones = get_data() rewards = (rewards + 8) / 8 # 计算values和targets values = critic_model(states) targets = critic_model(next_states).detach() # 目标,不作用梯度 targets = targets * 0.98 # 结束状态价值为零 targets *= (1- dones) # 计算总回报(奖励+下一状态) targets += rewards # 计算优势,类比策略梯度中的reward_sum deltas = (targets - values).squeeze().tolist() # 标量数值 advantages = get_advantage(deltas) advantages = torch.FloatTensor(advantages).reshape(-1, 1) # 取出每一步动作演员给的评分 mu, std = actor_model(states) action_dist = torch.distributions.Normal(mu, std) # 找到当前连续动作在分布下的概率值,exp()做还原使用,就数据补参与梯度更新 old_probs = action_dist.log_prob(actions).exp().detach() # 每批数据反复训练10次 for _ in range(10): # 重新计算每一步动作概率 mu, std = actor_model(states) new_action_dist = torch.distributions.Normal(mu, std) new_probs = new_action_dist.log_prob(actions).exp() # 概率变化率 ratios = new_probs / old_probs # 计算不clip和clip中的loss,取较小值 no_clip_loss = ratios * advantages clip_loss = torch.clamp(ratios, min=0.8, max=1.2) * advantages loss = 
def train():
    optimizer = torch.optim.Adam(actor_model.parameters(), lr=1e-5)
    optimizer_td = torch.optim.Adam(critic_model.parameters(), lr=1e-3)

    # Play N episodes; each batch of data is reused M times
    for epoch in range(1000):
        states, actions, rewards, next_states, dones = get_data()
        rewards = (rewards + 8) / 8  # rescale Pendulum's rewards from roughly [-16, 0] to about [-1, 1]

        # Compute values and TD targets
        values = critic_model(states)
        targets = critic_model(next_states).detach()  # targets: no gradient flows through them
        targets = targets * 0.98  # discount factor gamma
        targets *= (1 - dones)  # terminal states have zero value
        targets += rewards  # target = reward + discounted next-state value

        # Compute advantages, analogous to reward_sum in the policy-gradient method
        deltas = (targets - values).squeeze().tolist()  # plain Python floats
        advantages = get_advantage(deltas)
        advantages = torch.FloatTensor(advantages).reshape(-1, 1)

        # Probability the old (pre-update) actor assigned to each action taken
        mu, std = actor_model(states)
        action_dist = torch.distributions.Normal(mu, std)
        # log_prob().exp() recovers the density of the taken continuous action;
        # the old data is detached so it does not take part in the gradient update
        old_probs = action_dist.log_prob(actions).exp().detach()

        # Train on each batch of data 10 times
        for _ in range(10):
            # Recompute the action probabilities under the current policy
            mu, std = actor_model(states)
            new_action_dist = torch.distributions.Normal(mu, std)
            new_probs = new_action_dist.log_prob(actions).exp()

            # Probability ratio
            ratios = new_probs / old_probs

            # Unclipped and clipped surrogate terms; take the elementwise minimum
            no_clip_loss = ratios * advantages
            clip_loss = torch.clamp(ratios, min=0.8, max=1.2) * advantages
            loss = -torch.min(no_clip_loss, clip_loss).mean()

            # Recompute the values and the temporal-difference (TD) loss
            values = critic_model(states)
            loss_td = torch.nn.MSELoss()(values, targets)

            # Update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            optimizer_td.zero_grad()
            loss_td.backward()
            optimizer_td.step()

        if epoch % 100 == 0:
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)


train()
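The script targets the old Gym API: Pendulum-v0, env.reset() returning only the observation, and env.step() returning a 4-tuple. Current Gymnasium releases have changed those interfaces; assuming the gymnasium package is installed (an assumption, not part of the original post), the environment-facing lines would need roughly this shape:

import gymnasium as gym

env = gym.make("Pendulum-v1")    # Pendulum-v0 has been retired
state, _ = env.reset()           # reset() now returns (observation, info)
# step() now returns a 5-tuple with separate termination flags
next_state, reward, terminated, truncated, _ = env.step([0.0])
done = terminated or truncated

The training logic itself is unchanged; only get_data(), test(), and the top-level reset need these adjustments.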
Always keep in mind the kind of person you want to become!