1. Policy: sample from a Normal distribution with mean mu and standard deviation std, squash the sample with tanh and scale it by action_bound to obtain the action, and use log_prob (with the tanh change-of-variables correction) to obtain the log-probability of that action.
dist = Normal(mu, std)
normal_sample = dist.rsample()  # reparameterized sample from Normal(mu, std)
log_prob = dist.log_prob(normal_sample)  # log-probability of the sample under the Gaussian
action = torch.tanh(normal_sample)  # squash the sample into (-1, 1)
# change-of-variables correction for the tanh-squashed (tanh_normal) distribution;
# note: the Jacobian term is 1 - tanh(normal_sample)^2 = 1 - action^2, not tanh(action)
log_prob = log_prob - torch.log(1 - action.pow(2) + 1e-7)
action = action * self.action_bound  # scale to the environment's action range
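The correction term in the snippet follows from the change-of-variables formula for a tanh-squashed Gaussian; the standard SAC derivation, with u the pre-squash sample, is

\log \pi(a \mid s) = \log \mathcal{N}(u;\, \mu(s), \sigma(s)) - \log\!\left(1 - \tanh^2(u)\right)
                   = \log \mathcal{N}(u;\, \mu(s), \sigma(s)) - \log\!\left(1 - a^2\right), \qquad a = \tanh(u).

The 1e-7 only guards against log(0); the constant contributed by the final action_bound scaling is omitted, as in the original code.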
2. Evaluate the next state-action pair (with the action sampled from the current policy) using both target_critic_1 and target_critic_2; when constructing next_value, take the minimum of the two Q-values and add the entropy term weighted by the temperature alpha.
next_value = torch.min(q1_value, q2_value) + self.log_alpha.exp() * entropy  # add the alpha-weighted entropy bonus
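In the standard SAC formulation this is the soft state value of the next state (here entropy = -log_prob):

V(s') = \min_{i=1,2} Q_{\theta_i'}(s', a') - \alpha \log \pi_\phi(a' \mid s'), \qquad a' \sim \pi_\phi(\cdot \mid s').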
3. The TD target uses reward + self.gamma * next_value * (1 - dones), so terminal transitions do not bootstrap.
td_target = rewards + self.gamma * next_value * (1 - dones)
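Substituting the soft value from step 2, the full target is

y = r + \gamma\,(1 - d)\left[\min_{i=1,2} Q_{\theta_i'}(s', a') - \alpha \log \pi_\phi(a' \mid s')\right],

so a terminal transition (d = 1) contributes only its immediate reward.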
4. Critic loss: the detached td_target and each current critic's Q-value form an MSE loss.
critic_1_loss = torch.mean(
    F.mse_loss(self.critic_1(states, actions), td_target.detach()))
critic_2_loss = torch.mean(
    F.mse_loss(self.critic_2(states, actions), td_target.detach()))
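Each critic therefore minimizes the standard SAC Bellman residual against the shared target y:

J_Q(\theta_i) = \mathbb{E}_{(s, a, r, s', d) \sim \mathcal{D}}\left[\left(Q_{\theta_i}(s, a) - y\right)^2\right], \qquad i = 1, 2.

Note that F.mse_loss already averages over the batch, so the outer torch.mean is a harmless no-op.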
5. Update the policy network: the actor re-samples actions and log-probabilities for the current states, both critics produce q1_value and q2_value, and the loss maximizes the minimum Q-value plus the alpha-weighted entropy. When the policy entropy falls below the target H0, the temperature update in step 6 increases alpha; otherwise alpha decreases and policy training focuses more on value improvement.
actor_loss = torch.mean(-self.log_alpha.exp() * entropy - torch.min(q1_value, q2_value))
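With entropy = -log_prob substituted in, this is exactly the standard SAC policy objective, minimized through the reparameterized actions:

J_\pi(\phi) = \mathbb{E}_{s \sim \mathcal{D},\, a \sim \pi_\phi}\left[\alpha \log \pi_\phi(a \mid s) - \min_{i=1,2} Q_{\theta_i}(s, a)\right].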
6. Update the temperature alpha against the target entropy; alpha controls how strongly the entropy term is weighted.
alpha_loss = torch.mean(
    (entropy - self.target_entropy).detach() * self.log_alpha.exp())
self.log_alpha_optimizer.zero_grad()
alpha_loss.backward()
self.log_alpha_optimizer.step()
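The loss above corresponds to the standard SAC temperature objective, which makes the sign behaviour explicit:

J(\alpha) = \mathbb{E}_{a \sim \pi_\phi}\left[\alpha\left(\mathcal{H}(\pi_\phi(\cdot \mid s)) - \mathcal{H}_0\right)\right], \qquad \frac{\partial J}{\partial \alpha} = \mathcal{H} - \mathcal{H}_0,

so gradient descent raises alpha when the policy entropy is below the target H0 and lowers it otherwise; the detach() keeps this update from back-propagating into the actor.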
train.py
import random
import gym
import numpy as np
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import rl_utils
from model import SACContinuous

env_name = "Pendulum-v0"
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bound = env.action_space.high[0]  # maximum action magnitude
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)

actor_lr = 3e-4
critic_lr = 3e-3
alpha_lr = 3e-4
num_episodes = 100
hidden_dim = 128
gamma = 0.99
tau = 0.005  # soft-update coefficient
buffer_size = 100000
minimal_size = 1000
batch_size = 64
target_entropy = -env.action_space.shape[0]
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

replay_buffer = rl_utils.ReplayBuffer(buffer_size)
agent = SACContinuous(state_dim, hidden_dim, action_dim, action_bound,
                      actor_lr, critic_lr, alpha_lr, target_entropy, tau,
                      gamma, device)

return_list = rl_utils.train_off_policy_agent(env, agent, num_episodes,
                                              replay_buffer, minimal_size,
                                              batch_size)

episodes_list = list(range(len(return_list)))
plt.plot(episodes_list, return_list)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()

mv_return = rl_utils.moving_average(return_list, 9)
plt.plot(episodes_list, mv_return)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('SAC on {}'.format(env_name))
plt.show()
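This script assumes an older gym release in which Pendulum-v0 and env.seed() still exist. On gym >= 0.26 (or gymnasium) the environment is Pendulum-v1, reset() takes the seed and returns (obs, info), and step() returns a five-tuple; the sketch below shows the adaptation that would be needed under those version assumptions (the rl_utils training loops would need the same change):

import gym

env = gym.make("Pendulum-v1")        # Pendulum-v0 was removed in newer releases
state, _ = env.reset(seed=0)         # reset() now seeds the env and returns (obs, info)
action = env.action_space.sample()
next_state, reward, terminated, truncated, _ = env.step(action)  # 5-tuple step
done = terminated or truncated       # collapse back to the single done flag used elsewhere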
model.py
import torch
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np


class PolicyNetContinuous(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNetContinuous, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc_mu = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_std = torch.nn.Linear(hidden_dim, action_dim)
        self.action_bound = action_bound

    def forward(self, x):
        x = F.relu(self.fc1(x))
        mu = self.fc_mu(x)
        std = F.softplus(self.fc_std(x))
        dist = Normal(mu, std)
        normal_sample = dist.rsample()  # reparameterized sample from Normal(mu, std)
        log_prob = dist.log_prob(normal_sample)  # Gaussian log-probability of the sample
        action = torch.tanh(normal_sample)  # squash into (-1, 1)
        # change-of-variables correction for the tanh-squashed distribution
        # (uses 1 - action^2 since action = tanh(normal_sample))
        log_prob = log_prob - torch.log(1 - action.pow(2) + 1e-7)
        action = action * self.action_bound  # scale to the environment's action range
        return action, log_prob


class QValueNetContinuous(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNetContinuous, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim + action_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x, a):
        cat = torch.cat([x, a], dim=1)
        x = F.relu(self.fc1(cat))
        x = F.relu(self.fc2(x))
        return self.fc_out(x)


class SACContinuous:
    """SAC algorithm for continuous action spaces."""

    def __init__(self, state_dim, hidden_dim, action_dim, action_bound,
                 actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma,
                 device):
        self.actor = PolicyNetContinuous(state_dim, hidden_dim, action_dim,
                                         action_bound).to(device)
        self.critic_1 = QValueNetContinuous(state_dim, hidden_dim,
                                            action_dim).to(device)
        self.critic_2 = QValueNetContinuous(state_dim, hidden_dim,
                                            action_dim).to(device)
        self.target_critic_1 = QValueNetContinuous(state_dim, hidden_dim,
                                                   action_dim).to(device)
        self.target_critic_2 = QValueNetContinuous(state_dim, hidden_dim,
                                                   action_dim).to(device)
        # initialize the target Q-networks with the same parameters as the Q-networks
        self.target_critic_1.load_state_dict(self.critic_1.state_dict())
        self.target_critic_2.load_state_dict(self.critic_2.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_1_optimizer = torch.optim.Adam(self.critic_1.parameters(),
                                                   lr=critic_lr)
        self.critic_2_optimizer = torch.optim.Adam(self.critic_2.parameters(),
                                                   lr=critic_lr)
        # optimizing log(alpha) instead of alpha keeps training more stable
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float)
        self.log_alpha.requires_grad = True  # alpha is learned by gradient descent
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                    lr=alpha_lr)
        self.target_entropy = target_entropy  # target entropy H0
        self.gamma = gamma
        self.tau = tau
        self.device = device

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        action = self.actor(state)[0]
        return [action.item()]

    def calc_target(self, rewards, next_states, dones):
        # sample next actions and their log-probabilities from the current policy
        next_actions, log_prob = self.actor(next_states)
        entropy = -log_prob
        q1_value = self.target_critic_1(next_states, next_actions)
        q2_value = self.target_critic_2(next_states, next_actions)
        # take the minimum of the two target Q-values and add the alpha-weighted entropy
        next_value = torch.min(q1_value,
                               q2_value) + self.log_alpha.exp() * entropy
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(),
                                       net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # as in earlier chapters, reshape the Pendulum reward to ease training
        rewards = (rewards + 8.0) / 8.0

        # update the two Q-networks
        td_target = self.calc_target(rewards, next_states, dones)
        critic_1_loss = torch.mean(
            F.mse_loss(self.critic_1(states, actions), td_target.detach()))
        critic_2_loss = torch.mean(
            F.mse_loss(self.critic_2(states, actions), td_target.detach()))
        self.critic_1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic_1_optimizer.step()
        self.critic_2_optimizer.zero_grad()
        critic_2_loss.backward()
        self.critic_2_optimizer.step()

        # update the policy network
        new_actions, log_prob = self.actor(states)
        entropy = -log_prob
        q1_value = self.critic_1(states, new_actions)
        q2_value = self.critic_2(states, new_actions)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy -
                                torch.min(q1_value, q2_value))
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update alpha
        alpha_loss = torch.mean(
            (entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()

        self.soft_update(self.critic_1, self.target_critic_1)
        self.soft_update(self.critic_2, self.target_critic_2)
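Before launching full training it can be worth running a quick shape check on the agent. The snippet below is a minimal smoke test with random data; the batch of 8 transitions and the Pendulum-like dimensions (state_dim=3, action_dim=1, action_bound=2.0) are assumptions for illustration, not part of the original files:

import numpy as np
import torch
from model import SACContinuous

device = torch.device("cpu")
agent = SACContinuous(state_dim=3, hidden_dim=128, action_dim=1, action_bound=2.0,
                      actor_lr=3e-4, critic_lr=3e-3, alpha_lr=3e-4,
                      target_entropy=-1, tau=0.005, gamma=0.99, device=device)

# one fake batch of 8 transitions with Pendulum-like shapes
batch = {
    'states': np.random.randn(8, 3).astype(np.float32),
    'actions': np.random.uniform(-2, 2, size=(8, 1)).astype(np.float32),
    'rewards': np.random.randn(8).astype(np.float32),
    'next_states': np.random.randn(8, 3).astype(np.float32),
    'dones': np.zeros(8, dtype=np.float32),
}
agent.update(batch)  # should complete one full SAC update without shape errors
print(agent.take_action(np.zeros(3, dtype=np.float32)))  # a single action within [-2, 2]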
rl_utils.py
from tqdm import tqdm
import numpy as np
import torch
import collections
import random


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return np.array(state), action, reward, np.array(next_state), done

    def size(self):
        return len(self.buffer)


def moving_average(a, window_size):
    cumulative_sum = np.cumsum(np.insert(a, 0, 0))
    middle = (cumulative_sum[window_size:] -
              cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size - 1, 2)
    begin = np.cumsum(a[:window_size - 1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))


def train_on_policy_agent(env, agent, num_episodes):
    return_list = []
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                transition_dict = {'states': [], 'actions': [], 'next_states': [],
                                   'rewards': [], 'dones': []}
                state = env.reset()
                done = False
                # collect one full episode into a single batch
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    transition_dict['states'].append(state)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(next_state)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    state = next_state
                    episode_return += reward
                return_list.append(episode_return)
                agent.update(transition_dict)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    return return_list


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size):
    return_list = []
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                state = env.reset()
                done = False
                while not done:
                    action = agent.take_action(state)
                    next_state, reward, done, _ = env.step(action)
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {'states': b_s, 'actions': b_a,
                                           'next_states': b_ns, 'rewards': b_r,
                                           'dones': b_d}
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({'episode': '%d' % (num_episodes / 10 * i + i_episode + 1),
                                      'return': '%.3f' % np.mean(return_list[-10:])})
                pbar.update(1)
    return return_list


def compute_advantage(gamma, lmbda, td_delta):
    td_delta = td_delta.detach().numpy()
    advantage_list = []
    advantage = 0.0
    for delta in td_delta[::-1]:
        # accumulate the GAE advantage backwards through the trajectory
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()  # restore chronological order
    return torch.tensor(advantage_list, dtype=torch.float)
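After training, a quick check of the learned behaviour is often useful. A minimal evaluation loop, assuming the same old-style gym API as the training loops above (this helper is not part of the original files):

def evaluate(env, agent, n_episodes=5):
    """Roll out the current policy for a few episodes and return the mean episode return."""
    returns = []
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        episode_return = 0
        while not done:
            action = agent.take_action(state)  # actions are still sampled from the stochastic policy
            state, reward, done, _ = env.step(action)
            episode_return += reward
        returns.append(episode_return)
    return sum(returns) / n_episodes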