Reinforcement Learning Code in Practice 09: the SAC Algorithm

On-policy algorithms: A3C, PPO, TRPO

Off-policy algorithm: DDPG (training is unstable and easily gets stuck in local optima)

SAC: off-policy, stochastic policy. It maximizes the cumulative reward while also maximizing the policy's entropy, which encourages better exploration of the environment: the higher the entropy, the more random the policy, and the entropy bonus directly increases the objective.

Soft Actor-Critic objective (written out below; the original figure is not reproduced):

The larger α is, the more random (exploratory) the policy.
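In standard form (as in the SAC paper), the maximum-entropy objective is

J(\pi) = \sum_t \mathbb{E}_{(s_t, a_t) \sim \rho_\pi}\left[ r(s_t, a_t) + \alpha \, \mathcal{H}\big(\pi(\cdot \mid s_t)\big) \right],
\qquad \mathcal{H}\big(\pi(\cdot \mid s_t)\big) = -\mathbb{E}_{a \sim \pi(\cdot \mid s_t)}\left[ \log \pi(a \mid s_t) \right]

where α is the temperature coefficient that trades off reward against entropy.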

Algorithm flow (the original diagram is not reproduced): roll out the current policy and store transitions in a replay buffer, then repeatedly sample batches to update the two critics, the actor, and the temperature α, and soft-update the target critics; the key target computation is shown below.
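The soft Bellman target that both critics regress toward (this is what get_target in the code computes, with the clipped double-Q trick) is

y = r + \gamma (1 - d)\left[ \min_{i=1,2} Q_{\bar\theta_i}(s', a') + \alpha \, \mathcal{H}\big(\pi(\cdot \mid s')\big) \right], \qquad a' \sim \pi(\cdot \mid s')

with discount γ = 0.99 and d the done flag.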

How is the policy's entropy computed? (See the tanh correction in ModelAction.forward below and the sketch that follows.)
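The policy squashes a Gaussian sample through tanh, so the log-probability needs a change-of-variables correction; the per-sample entropy used in the code is then the negative corrected log-probability. A minimal standalone sketch (the toy mu/std values here are placeholders, not the network's outputs):

import torch

# Toy Gaussian parameters (placeholders; in the actor they come from the network)
mu, std = torch.zeros(1), torch.ones(1)
dist = torch.distributions.Normal(mu, std)

u = dist.rsample()             # reparameterized Gaussian sample
a = torch.tanh(u)              # squashed action in (-1, 1)

# Change of variables: log pi(a) = log N(u; mu, std) - log(1 - tanh(u)^2 + eps)
log_prob = dist.log_prob(u) - torch.log(1 - a ** 2 + 1e-7)
entropy_estimate = -log_prob   # single-sample estimate of the policy entropy
print(entropy_estimate)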

References: 1) https://zhuanlan.zhihu.com/p/90671492; 2) https://www.cnblogs.com/deeplearningcry/p/16151156.html

Code practice:

"""
@Date   :2022/11/2
@Fun: SAC algorithm
"""
import random
import gym
import torch
import math
import numpy as np
from matplotlib import pyplot as plt
from IPython import display

env = gym.make("Pendulum-v0")
# Agent state
state = env.reset()
# Action space (continuous control)
actions = env.action_space
print(state, actions)

# Actor model: takes a state and samples an action from a tanh-squashed Gaussian
class ModelAction(torch.nn.Module):
    """
    继承nn.Module,必须实现__init__() 方法和forward()方法。其中__init__() 方法里创建子模块,在forward()方法里拼接子模块。
    """
    def __init__(self):
        super().__init__()
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(3, 64),
                                            torch.nn.ReLU()
                                            )
        self.mu = torch.nn.Linear(64, 1)
        self.std = torch.nn.Sequential(torch.nn.Linear(64,1),
                                       torch.nn.Softplus())

    def forward(self, state):
        state = self.fc_state(state)
        mu = self.mu(state)
        std = self.std(state)
        # Build a batch of Normal distributions from the predicted means and stds
        dist = torch.distributions.Normal(mu, std)
        # Reparameterized sampling (rsample keeps the computation graph differentiable)
        sample = dist.rsample()
        action = torch.tanh(sample)

        # Entropy estimate: negative log-prob with the tanh change-of-variables correction
        log_prob = dist.log_prob(sample)
        entropy = log_prob - (1 - action ** 2 + 1e-7).log()
        entropy = -entropy

        # Scale the tanh output to Pendulum's action range [-2, 2]
        return action * 2, entropy
# Policy network (actor)
actor_model = ModelAction()

class ModelValue(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_state = torch.nn.Sequential(torch.nn.Linear(4, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 64),
                                            torch.nn.ReLU(),
                                            torch.nn.Linear(64, 1),)

    def forward(self, state, action):
        # Takes state and action features and outputs a scalar value
        state_action = torch.cat([state, action], dim=1)
        return self.fc_state(state_action)

# Critic models: four value networks (two online critics and two target critics)
critic_model1 = ModelValue()
critic_model2 = ModelValue()
critic_model_next1 = ModelValue()
critic_model_next2 = ModelValue()

critic_model_next1.load_state_dict(critic_model1.state_dict())
critic_model_next2.load_state_dict(critic_model2.state_dict())

# Learnable temperature α; optimizing log(α) is more numerically stable
alpha = torch.tensor(math.log(0.01))
alpha.requires_grad = True

# Sample an action from the actor for the given state (used for rollouts and evaluation)
def get_action(state):
    state = torch.FloatTensor(state).reshape(1, 3)
    action, _ = actor_model(state)

    return action.item()

# Off-policy learning: maintain a replay buffer of transitions
datas = []
def update_data():
    state = env.reset()
    done = False

    while not done:
        action = get_action(state)
        next_state, reward, done, _ = env.step([action])
        datas.append((state, action, reward, next_state, done))
        state = next_state

    while len(datas) > 10000:
        datas.pop(0)

# Sample a training batch from the replay buffer
def get_samples():
    batch_size = 64
    sample = random.sample(datas, batch_size)
    states = torch.FloatTensor([i[0] for i in sample]).reshape(-1, 3)
    actions = torch.FloatTensor([i[1] for i in sample]).reshape(-1, 1)
    rewards = torch.FloatTensor([i[2] for i in sample]).reshape(-1, 1)
    next_states = torch.FloatTensor([i[3] for i in sample]).reshape(-1, 3)
    dones = torch.LongTensor([i[4] for i in sample]).reshape(-1, 1)

    return states, actions, rewards, next_states, dones


# Compute the TD target y for the critics (soft Bellman backup)
def get_target(reward, next_state, done):
    # Next-state action and entropy under the current policy
    action, entropy = actor_model(next_state)
    # Evaluate with both target critics and take the minimum (clipped double-Q)
    target1 = critic_model_next1(next_state, action)
    target2 = critic_model_next2(next_state, action)
    target = torch.min(target1, target2)
    # Add the entropy bonus, then discount, mask terminal states, and add the reward
    target += alpha.exp() * entropy
    target *= 0.99
    target *= (1 - done)
    target += reward

    return target

# Actor loss: maximize min(Q1, Q2) + α·entropy using the online critics, i.e. minimize the negative
def get_loss_action(state):
    action, entropy = actor_model(state)
    target1 = critic_model1(state, action)
    target2 = critic_model2(state, action)
    value = torch.min(target1, target2)

    loss_action = -alpha.exp() * entropy
    loss_action -= value

    return loss_action.mean(), entropy

# Soft (Polyak) update of a target network
def soft_update(model, model_next):
    # Move each target parameter a small step (tau = 0.005) toward the online parameter
    for param, target_param in zip(model.parameters(), model_next.parameters()):
        value = target_param.data * (1 - 0.005) + param.data * 0.005
        target_param.data.copy_(value)

# Run one evaluation episode with the current policy and return the total reward
def test():
    state = env.reset()
    reward_sum = 0
    over = False

    while not over:
        action = get_action(state)

        state, reward, over, _ = env.step([action])
        reward_sum += reward

    return reward_sum

def train():

    optimizer_action = torch.optim.Adam(actor_model.parameters(), lr=5e-4)
    optimizer_value1 = torch.optim.Adam(critic_model1.parameters(), lr=5e-3)
    optimizer_value2 = torch.optim.Adam(critic_model2.parameters(), lr=5e-3)

    # Learnable temperature that controls how much the entropy term matters
    optimizer_alpha = torch.optim.Adam([alpha], lr=5e-3)

    # Play N episodes; after each episode run a batch of gradient updates
    for epoch in range(200):
        # Collect one new episode of transitions into the replay buffer
        update_data()

        # About 200 updates per collected episode (one Pendulum episode is roughly 200 steps)
        for _ in range(200):

            states, actions, rewards, next_states, dones = get_samples()
            # Rescale Pendulum rewards (roughly in [-16, 0]) into roughly [-1, 1]
            rewards = (rewards + 8) / 8

            # Compute the TD target
            target = get_target(rewards, next_states, dones)
            target = target.detach()
            # Current Q estimates from both online critics
            value1 = critic_model1(states, actions)
            value2 = critic_model2(states, actions)
            # Both critics regress toward the same TD target
            loss_value1 = torch.nn.MSELoss()(value1, target)
            loss_value2 = torch.nn.MSELoss()(value2, target)

            optimizer_value1.zero_grad()
            loss_value1.backward()
            optimizer_value1.step()

            optimizer_value2.zero_grad()
            loss_value2.backward()
            optimizer_value2.step()

            # Actor update
            loss_action, entropy = get_loss_action(states)
            optimizer_action.zero_grad()
            loss_action.backward()
            optimizer_action.step()

            # Temperature update: push the policy entropy toward the target value -1 (= -action_dim)
            loss_alpha = alpha.exp() * (entropy + 1).detach()
            loss_alpha = loss_alpha.mean()

            optimizer_alpha.zero_grad()
            loss_alpha.backward()
            optimizer_alpha.step()

            soft_update(critic_model1, critic_model_next1)
            soft_update(critic_model2, critic_model_next2)

        if epoch % 20 == 0:
            # Evaluate: average return over 10 test episodes
            result = sum([test() for _ in range(10)]) / 10
            print(epoch, result)

train()

 
