强化学习代码实战-08 PPO算法（倒立摆）

连续性问题处理
"""
@Date   ：2022/11/2
@Fun: 倒立摆控制
"""
import random
import gym
import torch
import numpy as np
from matplotlib import pyplot as plt
from IPython import display

# Create the Gym environment registered under the id "Pendulum-v0"
env = gym.make("Pendulum-v0")
# Initial observation (state) of the agent
state = env.reset()
# Action space (this is a continuous-control problem)
actions = env.action_space
print(state, actions)

# Actor model: takes a state and outputs the parameters of the distribution
# from which the action is sampled
class Model(torch.nn.Module):
    """Gaussian policy network for a 3-feature state and a 1-D action.

    Sub-modules are created in ``__init__`` and wired together in
    ``forward``: one shared hidden layer feeds a mean head and a
    standard-deviation head.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        hidden_width = 128
        # Shared trunk: 3 state features -> 128 ReLU units.
        self.fc_state = nn.Sequential(nn.Linear(3, hidden_width), nn.ReLU())
        # Mean head: tanh bounds the mean to [-1, 1].
        self.fc_mu = nn.Sequential(nn.Linear(hidden_width, 1), nn.Tanh())
        # Std head: softplus keeps the standard deviation non-negative.
        self.fc_std = nn.Sequential(nn.Linear(hidden_width, 1), nn.Softplus())

    def forward(self, state):
        """Return ``(mu, std)`` of the action distribution for ``state``."""
        features = self.fc_state(state)
        return self.fc_mu(features), self.fc_std(features)

# The actor learns, for each state, the parameters (mu, std) of a probability density over actions
actor_model = Model()
# Critic model: evaluates a state and outputs a single score for how good it is
critic_model = torch.nn.Sequential(torch.nn.Linear(3, 128),
                                   torch.nn.ReLU(),
                                   torch.nn.Linear(128, 1))
# The actor picks an action by sampling from its distribution
def get_action(state):
    """Sample one continuous action from the actor's policy for ``state``.

    Args:
        state: A single observation with 3 components (anything
            ``torch.as_tensor`` accepts, e.g. a NumPy array or a list).

    Returns:
        The sampled action as a Python float.
    """
    state = torch.as_tensor(state, dtype=torch.float32).reshape(1, 3)
    # Rollouts never back-propagate through this forward pass, so skip
    # building the autograd graph.
    with torch.no_grad():
        mu, std = actor_model(state)
        # Draw the continuous action from the Normal(mu, std) density.
        action = torch.distributions.Normal(mu, std).sample().item()

    return action

# Collect the sample data of one full episode
def get_data():
    """Play one episode with the current actor and return its transitions.

    Returns:
        A tuple ``(states, actions, rewards, next_states, dones)`` of
        tensors with one row per step: ``states`` and ``next_states`` are
        float tensors reshaped to ``(-1, 3)``; ``actions`` and ``rewards``
        are float tensors reshaped to ``(-1, 1)``; ``dones`` is an int64
        tensor reshaped to ``(-1, 1)``.
    """
    states = []
    rewards = []
    actions = []
    next_states = []
    dones = []

    state = env.reset()
    done = False
    while not done:
        action = get_action(state)
        # The environment expects the action wrapped in a sequence.
        next_state, reward, done, _ = env.step([action])
        states.append(state)
        rewards.append(reward)
        actions.append(action)
        next_states.append(next_state)
        dones.append(done)

        state = next_state

    # Stack into a single ndarray before converting: building a tensor
    # straight from a list of ndarrays is slow and triggers a PyTorch warning.
    states = torch.as_tensor(np.array(states), dtype=torch.float32).reshape(-1, 3)
    rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32).reshape(-1, 1)
    actions = torch.as_tensor(np.array(actions), dtype=torch.float32).reshape(-1, 1)  # continuous actions
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32).reshape(-1, 3)
    dones = torch.as_tensor(np.array(dones), dtype=torch.int64).reshape(-1, 1)

    return states, actions, rewards, next_states, dones

def test():
    """Run one episode with the current actor and return the summed reward."""
    observation = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        # Actions are sampled from the policy, exactly as during data collection.
        sampled_action = get_action(observation)
        observation, step_reward, finished, _ = env.step([sampled_action])
        total_reward += step_reward

    return total_reward

# Advantage function
def get_advantage(deltas, gamma=0.98, lam=0.95):
    """Accumulate TD errors into advantages (GAE, generalized advantage estimation).

    Walks the episode from the last step to the first so that each
    advantage is ``delta_t + gamma * lam * advantage_{t+1}``.

    Args:
        deltas: Per-step TD errors in time order.
        gamma: Discount factor.
        lam: GAE smoothing factor.

    Returns:
        A list of advantages in the same time order as ``deltas``; an empty
        list when ``deltas`` is empty. ``deltas`` itself is not modified.
    """
    decay = gamma * lam
    advantages = []
    running = 0
    # Accumulate back-to-front, then flip the result into time order.
    for delta in reversed(deltas):
        running = decay * running + delta
        advantages.append(running)
    advantages.reverse()

    return advantages

# Quick demonstration: print the advantages for a small hand-written list of deltas
print(get_advantage([0.8, 0.9, 0.99, 1.00, 1.11, 1.12]))

def train():
    """Train the actor and the critic with clipped PPO.

    Each of the 1000 epochs collects one episode with ``get_data``, builds
    TD targets and GAE advantages from the critic, and then takes 10
    optimisation steps on that same batch for both networks. Every 100
    epochs the average return over 10 ``test`` episodes is printed.
    """
    gamma = 0.98                    # discount applied to the next-state value
    clip_low, clip_high = 0.8, 1.2  # PPO clipping range for the probability ratio

    optimizer = torch.optim.Adam(actor_model.parameters(), lr=1e-5)
    optimizer_td = torch.optim.Adam(critic_model.parameters(), lr=1e-3)
    mse_loss = torch.nn.MSELoss()

    # Play N episodes; each episode's batch is reused for M updates
    for epoch in range(1000):
        states, actions, rewards, next_states, dones = get_data()
        # Rescale rewards (presumably to bring Pendulum's rewards into a
        # small range around zero -- TODO confirm against the env's reward spec).
        rewards = (rewards + 8) / 8

        # Everything in this block is a fixed quantity for the inner loop,
        # so no gradients are needed.
        with torch.no_grad():
            values = critic_model(states)
            # TD target: reward + discounted next-state value, with the
            # value of a terminal state forced to zero.
            targets = rewards + gamma * critic_model(next_states) * (1 - dones)

            # TD errors as plain Python numbers. reshape(-1) always yields a
            # list, even for a one-step episode (squeeze() would give a scalar).
            deltas = (targets - values).reshape(-1).tolist()

            # Log-probability of each taken action under the pre-update policy.
            mu, std = actor_model(states)
            old_log_probs = torch.distributions.Normal(mu, std).log_prob(actions)

        # Advantages play the role that the return plays in plain policy gradient.
        advantages = torch.FloatTensor(get_advantage(deltas)).reshape(-1, 1)

        # Train on each batch 10 times
        for _ in range(10):
            # Re-evaluate the taken actions under the current policy.
            mu, std = actor_model(states)
            new_log_probs = torch.distributions.Normal(mu, std).log_prob(actions)
            # Probability ratio, computed in log space: dividing exponentiated
            # densities can underflow to 0/0 and produce NaN.
            ratios = torch.exp(new_log_probs - old_log_probs)

            # Clipped surrogate objective: take the smaller of the two terms.
            no_clip_loss = ratios * advantages
            clip_loss = torch.clamp(ratios, min=clip_low, max=clip_high) * advantages
            loss = -torch.min(no_clip_loss, clip_loss).mean()

            # Recompute the values and the temporal-difference loss.
            loss_td = mse_loss(critic_model(states), targets)

            # Update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            optimizer_td.zero_grad()
            loss_td.backward()
            optimizer_td.step()

        if epoch % 100 == 0:
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)


# Start training (runs at module level when this file is executed)
train()
posted @ 2022-11-18 21:40 今夜无风 阅读(948) 评论(0) 编辑 收藏 举报
刷新页面 返回顶部
强化学习代码实战-08 PPO算法 （倒立摆）

公告

强化学习代码实战-08 PPO算法（倒立摆）