Reinforcement Learning: CartPole with DQN, REINFORCE, Actor-Critic, DDPG, PPO, and SAC (PyTorch)
CartPole is a classic reinforcement learning task; this post solves it with several different algorithms, all implemented in PyTorch.
What follows is the original version of the post. On September 14, 2022, Dueling DQN, Actor-Critic, and SAC implementations were added and the PPO and DDPG code was updated; the new material is at the end of the post.
DQN:
References:
Algorithm idea:
https://mofanpy.com/tutorials/machine-learning/torch/DQN/
Implementation:
https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
My understanding: DQN combines Q-learning with a neural network, which solves the problem of a continuous state space. Because Q-learning is off-policy, a target network is needed, i.e., a lagged copy of the online network; it prevents the situation where a non-optimal action happens to be sampled, its estimated value rises, and the agent then keeps choosing that non-optimal action, which hurts learning efficiency. Since the network's inputs and targets should be close to independent and identically distributed, a replay buffer with random sampling is used to break the correlation between consecutive transitions. The network's objective is simply to predict Q values more accurately, so the loss is the mean squared error between the target Q value and the evaluated Q value.
Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import random
import numpy as np
from collections import namedtuple

GAMMA = 0.99
lr = 0.1
EPSION = 0.1
buffer_size = 10000   # capacity of the replay buffer
batch_size = 32
num_episode = 100000
target_update = 10    # copy net's parameters to target_net every this many episodes


# Q network
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.Linear1 = nn.Linear(input_size, hidden_size)
        self.Linear2 = nn.Linear(hidden_size, hidden_size)
        self.Linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.Linear1(x))
        x = F.relu(self.Linear2(x))
        return self.Linear3(x)


# namedtuple container for one transition
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'done', 'next_state'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):  # random sampling
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class DQN(object):
    def __init__(self, input_size, hidden_size, output_size):
        self.net = Net(input_size, hidden_size, output_size)
        self.target_net = Net(input_size, hidden_size, output_size)
        self.optim = optim.Adam(self.net.parameters(), lr=lr)
        self.target_net.load_state_dict(self.net.state_dict())
        self.buffer = ReplayMemory(buffer_size)
        self.loss_func = nn.MSELoss()
        self.steps_done = 0

    def put(self, s0, a0, r, t, s1):
        self.buffer.push(s0, a0, r, t, s1)

    def select_action(self, state):
        # epsilon-greedy action selection
        eps_threshold = random.random()
        action = self.net(torch.Tensor(state))
        if eps_threshold > EPSION:
            choice = torch.argmax(action).numpy()
        else:
            choice = np.random.randint(0, action.shape[0])  # random action in [0, action_dim)
        return choice

    def update_parameters(self):
        if self.buffer.__len__() < batch_size:
            return
        samples = self.buffer.sample(batch_size)
        batch = Transition(*zip(*samples))
        # convert the tuples to numpy / Tensors
        tmp = np.vstack(batch.action)
        state_batch = torch.Tensor(batch.state)
        action_batch = torch.LongTensor(tmp.astype(int))
        reward_batch = torch.Tensor(batch.reward)
        done_batch = torch.Tensor(batch.done)
        next_state_batch = torch.Tensor(batch.next_state)
        q_next = torch.max(self.target_net(next_state_batch).detach(), dim=1, keepdim=True)[0]
        q_eval = self.net(state_batch).gather(1, action_batch)
        # (1 - done) masks out the bootstrap term at terminal transitions;
        # unsqueeze keeps every factor of shape (batch_size, 1)
        q_tar = reward_batch.unsqueeze(1) + (1 - done_batch.unsqueeze(1)) * GAMMA * q_next
        loss = self.loss_func(q_eval, q_tar)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    # state space: 4-dimensional
    # action space: discrete with only two actions, 0 and 1
    Agent = DQN(env.observation_space.shape[0], 256, env.action_space.n)
    average_reward = 0  # running average reward over all episodes so far
    for i_episode in range(num_episode):
        s0 = env.reset()
        tot_reward = 0  # total reward of the current episode
        tot_time = 0    # actual duration of the episode (the reward definition may differ)
        while True:
            env.render()
            a0 = Agent.select_action(s0)
            s1, r, done, _ = env.step(a0)
            tot_time += r
            # reward shaping used in some online implementations (not needed here):
            # x, x_dot, theta, theta_dot = s1
            # r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            # r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            # r = r1 + r2
            tot_reward += r
            t = 1 if done else 0
            Agent.put(s0, a0, r, t, s1)  # store the transition in the replay buffer
            s0 = s1
            Agent.update_parameters()
            if done:
                average_reward = average_reward + 1 / (i_episode + 1) * (tot_reward - average_reward)
                print('Episode ', i_episode, 'tot_time: ', tot_time,
                      ' tot_reward: ', tot_reward, ' average_reward: ', average_reward)
                break
        if i_episode % target_update == 0:
            Agent.target_net.load_state_dict(Agent.net.state_dict())
One point deserves attention: some DQN implementations found online do not handle the terminal state, so they have to reshape the reward to get good results. Once the terminal state is taken into account, the raw reward is enough to learn from.
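To make the point concrete, here is a tiny sketch of the two ways of building the TD target (the variable names are placeholders, not the ones in the code above). `done` is 1 exactly at the transition where the pole falls, so masking the bootstrap term there is what lets the raw reward carry the learning signal:

# hedged sketch: q_next stands for max_a Q_target(s', a); all names are illustrative
target_without_terminal = reward + GAMMA * q_next                # ignores that the episode ended
target_with_terminal = reward + GAMMA * q_next * (1 - done)      # no bootstrap at terminal states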
Reinforce:
References:
Idea and code:
https://blog.csdn.net/qq_37266917/article/details/109855244
My understanding:
REINFORCE is a policy-gradient method: it performs gradient ascent on a parameterized policy. Note that the network should not be too complex, otherwise it overfits and learning becomes very hard. The policy gradient theorem tells us how to do the ascent: the return that multiplies each log-probability can be read as the step size of the ascent, i.e., the larger the return, the more that action's probability is increased. This is also why a baseline is introduced in policy gradient methods: it keeps the probability of a non-optimal action from growing too large just because it happened to be selected (although with enough samples this problem also resolves itself).
The network's loss is the return at time t multiplied by the log-probability of the action taken at time t, and then negated: the optimizer performs gradient descent to minimize the loss, so minimizing the negated objective maximizes the return.
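The code below uses the plain return without a baseline. As a minimal sketch of the baseline variant mentioned above (the function name and arguments are my own placeholders; the baseline here is simply the mean return of the batch), the loss would look like:

import torch

def reinforce_loss_with_baseline(returns, log_probs):
    # returns: 1-D tensor of discounted returns G_t; log_probs: 1-D tensor of log pi(a_t | s_t)
    baseline = returns.mean()               # simplest possible baseline
    advantages = returns - baseline         # reduces variance without biasing the gradient
    return -(advantages * log_probs).sum()  # negated so that minimizing the loss maximizes the return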
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import numpy as np
from torch.distributions import Categorical

GAMMA = 1.0
num_episode = 100000


class Policy(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Policy, self).__init__()
        self.Linear1 = nn.Linear(input_size, hidden_size)
        self.Linear1.weight.data.normal_(0, 0.1)
        self.Linear3 = nn.Linear(hidden_size, output_size)
        self.Linear3.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = F.relu(self.Linear1(x))
        return F.softmax(self.Linear3(x), dim=1)


class Reinforce(object):
    def __init__(self, input_size, hidden_size, output_size):
        self.net = Policy(input_size, hidden_size, output_size)
        self.optim = optim.Adam(self.net.parameters(), lr=0.01)

    def select_action(self, s):
        s = torch.Tensor(s).unsqueeze(0)
        probs = self.net(s)
        dist = Categorical(probs)
        a = dist.sample()
        log_prob = dist.log_prob(a)
        return a.item(), log_prob

    def update_parameters(self, rewards, log_probs):
        # accumulate -G_t * log pi(a_t | s_t) backwards through the episode
        R = 0
        loss = 0
        for i in reversed(range(len(rewards))):
            R = rewards[i] + GAMMA * R
            loss = loss - R * log_probs[i]
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    average_reward = 0
    Agent = Reinforce(env.observation_space.shape[0], 16, env.action_space.n)
    for i_episode in range(1, num_episode + 1):
        s = env.reset()
        log_probs = []
        rewards = []
        while True:
            env.render()
            a, prob = Agent.select_action(s)
            s1, r, done, _ = env.step(a)
            log_probs.append(prob)
            rewards.append(r)
            s = s1
            if done:
                average_reward = average_reward + (1 / (i_episode + 1)) * (np.sum(rewards) - average_reward)
                if i_episode % 100 == 0:
                    print('episode: ', i_episode, 'tot_rewards: ', np.sum(rewards),
                          'average_rewards: ', average_reward)
                break
        Agent.update_parameters(rewards, log_probs)
DDPG:
References:
Idea: https://www.cnblogs.com/pinard/p/10345762.html
Implementation: https://zhuanlan.zhihu.com/p/99406809
My understanding: DDPG uses the Actor-Critic framework and feels like a combination of DQN and policy gradients. In DDPG the actor outputs a concrete action rather than a probability distribution over actions, and the critic outputs the Q value of that action. Both the actor and the critic need a target network, and a replay buffer is needed to break the correlation between samples. I could not find DDPG + PyTorch code for CartPole online, so my solution may not be the best one. Since CartPole has only two discrete actions (0 and 1), I first gave the actor two outputs and used argmax to pick one, but argmax has no gradient, so I switched to a single output passed through a sigmoid: an output below 0.5 counts as action 0, above 0.5 as action 1. The critic's loss is similar to DQN's: the mean squared error between the target and the evaluated Q value. For the actor's loss, the actor first outputs an action a, the critic evaluates it, and the loss is the negated mean of those values.
Note (September 14, 2022): the original code forgot to add exploration noise, so it sometimes failed to learn the optimal policy; epsilon-greedy exploration has since been added to fix this.
Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import random
import numpy as np
from collections import namedtuple

GAMMA = 0.9
buffer_size = 10000
batch_size = 32
num_episode = 100000
tau = 0.02  # soft-update coefficient


# Actor: outputs a single number in (0, 1); it is rounded to action 0 or 1
class Actor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(input_size, hidden_size)
        self.Linear1.weight.data.normal_(0, 0.1)
        self.Linear3 = nn.Linear(hidden_size, output_size)
        self.Linear3.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = F.relu(self.Linear1(x))
        return torch.sigmoid(self.Linear3(x))


# Critic: takes a state-action pair and outputs its Q value
class Critic(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Critic, self).__init__()
        self.Linear1 = nn.Linear(input_size, hidden_size)
        self.Linear1.weight.data.normal_(0, 0.1)
        self.Linear3 = nn.Linear(hidden_size, output_size)
        self.Linear3.weight.data.normal_(0, 0.1)

    def forward(self, s, a):
        x = torch.cat([s, a], dim=1)
        x = F.relu(self.Linear1(x))
        return self.Linear3(x)


Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):  # random sampling
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class DDPG(object):
    def __init__(self, input_size, action_shape, hidden_size, output_size):
        self.actor = Actor(input_size, hidden_size, action_shape)
        self.actor_target = Actor(input_size, hidden_size, action_shape)
        self.critic = Critic(input_size + action_shape, hidden_size, action_shape)
        self.critic_target = Critic(input_size + action_shape, hidden_size, action_shape)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=0.01)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=0.01)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.buffer = ReplayMemory(buffer_size)
        self.loss_func = nn.MSELoss()
        self.steps_done = 0

    def put(self, s0, a0, r, s1, done):
        self.buffer.push(s0, a0, r, s1, done)

    def select_action(self, state):
        state = torch.Tensor(state)
        a = self.actor(state)
        if np.random.random() < 0.1:  # epsilon-greedy exploration (the fix mentioned above)
            return torch.tensor([np.random.randint(2)])
        else:
            return a

    def update_parameters(self):
        if self.buffer.__len__() < batch_size:
            return
        samples = self.buffer.sample(batch_size)
        batch = Transition(*zip(*samples))
        state_batch = torch.Tensor(batch.state)
        action_batch = torch.Tensor(batch.action).unsqueeze(0).view(-1, 1)
        reward_batch = torch.Tensor(batch.reward)
        next_state_batch = torch.Tensor(batch.next_state)
        done_batch = torch.Tensor(batch.done)  # stores 1 - done, so it masks terminal states
        # critic update
        next_action_batch = self.actor_target(next_state_batch).unsqueeze(0).detach().view(-1, 1)
        r_eval = self.critic(state_batch, action_batch)
        r_target = reward_batch + GAMMA * self.critic_target(next_state_batch, next_action_batch).detach().view(1, -1) * done_batch
        r_eval = torch.squeeze(r_eval)
        r_target = torch.squeeze(r_target)
        loss = self.loss_func(r_eval, r_target)
        self.critic_optim.zero_grad()
        loss.backward()
        self.critic_optim.step()
        # actor update: maximize the critic's value of the actor's own action
        a = self.actor(state_batch).unsqueeze(0).view(-1, 1)
        loss = -torch.mean(self.critic(state_batch, a))
        self.actor_optim.zero_grad()
        loss.backward()
        self.actor_optim.step()

        # soft update of the target networks
        def soft_update(net_target, net):
            for target_param, param in zip(net_target.parameters(), net.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

        soft_update(self.actor_target, self.actor)
        soft_update(self.critic_target, self.critic)


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    Agent = DDPG(env.observation_space.shape[0], 1, 16, env.action_space.n)
    average_reward = 0
    for i_episode in range(num_episode):
        s0 = env.reset()
        tot_reward = 0
        tot_time = 0
        while True:
            # env.render()
            a0 = Agent.select_action(s0)
            s1, r, done, _ = env.step(round(a0.detach().numpy()[0]))  # output < 0.5 -> action 0, otherwise action 1
            tot_time += r
            tot_reward += r
            Agent.put(s0, a0, r, s1, 1 - done)  # handling the terminal state matters; learning is hard without it
            s0 = s1
            Agent.update_parameters()
            if done:
                average_reward = average_reward + 1 / (i_episode + 1) * (tot_time - average_reward)
                if i_episode % 20 == 0:
                    print('Episode ', i_episode, 'tot_time: ', tot_time,
                          ' tot_reward: ', tot_reward, ' average_reward: ', average_reward)
                break
PPO:
References:
PPO algorithm flow and ideas:
https://blog.csdn.net/qq_30615903/article/details/86308045
https://www.jianshu.com/p/9f113adc0c50
PPO implementation:
https://blog.csdn.net/weixin_42165585/article/details/112362125
My understanding:
PPO also uses the Actor-Critic architecture, but unlike DDPG it is an on-policy algorithm, so it needs neither a target network nor a replay buffer, and the actor and critic can share network parameters to speed up learning. PPO introduces importance sampling so that the data from each episode can be reused for several training passes (in practice, sampling can be very time-consuming), which saves time, and the clip operation keeps each update from being too large.
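As a compact sketch of the ratio-and-clip idea (the names below are placeholders, not the variables used in the full implementation that follows):

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, eps_clip=0.2):
    ratio = torch.exp(new_log_probs - old_log_probs)   # importance ratio pi_new / pi_old
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages
    return -torch.min(surr1, surr2).mean()             # pessimistic, clipped surrogate objective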
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import namedtuple
import random
import gym

lr = 0.0005
Capacity = 10000
num_episode = 10000
Gamma = 0.98
eps_clip = 0.1


# Shared network: one hidden layer feeding both the actor head and the critic head
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.Linear1 = nn.Linear(input_size, hidden_size)
        self.Linear_actor = nn.Linear(hidden_size, output_size)
        self.Linear_critic = nn.Linear(hidden_size, 1)

    def actor_forward(self, s, dim):
        s = F.relu(self.Linear1(s))
        prob = F.softmax(self.Linear_actor(s), dim=dim)
        return prob

    def critic_forward(self, s):
        s = F.relu(self.Linear1(s))
        return self.Linear_critic(s)


Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'rate', 'done'))


class ReplayBuffer(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

    def clean(self):  # on-policy: the buffer is cleared after every update
        self.position = 0
        self.memory = []


class PPO(object):
    def __init__(self, input_size, hidden_size, output_size):
        super(PPO, self).__init__()
        self.net = Net(input_size, hidden_size, output_size)
        self.optim = optim.Adam(self.net.parameters(), lr=lr)
        self.buffer = ReplayBuffer(capacity=Capacity)

    def act(self, s, dim):
        s = torch.Tensor(s)
        return self.net.actor_forward(s, dim)

    def critic(self, s):
        return self.net.critic_forward(s)

    def put(self, s0, a0, r, s1, rate, done):
        self.buffer.push(s0, a0, r, s1, rate, done)

    def update_parameters(self):
        samples = self.buffer.memory
        batch = Transition(*zip(*samples))
        state_batch = torch.Tensor(batch.state)
        action_batch = torch.LongTensor(batch.action).view(-1, 1)
        reward_batch = torch.Tensor(batch.reward).view(-1, 1)
        next_state_batch = torch.Tensor(batch.next_state)
        rate_batch = torch.Tensor(batch.rate).view(-1, 1)       # pi_old(a|s) recorded at sampling time
        done_batch = torch.LongTensor(batch.done).view(-1, 1)   # stores 1 - done
        for i in range(3):  # reuse the same on-policy batch for a few epochs
            td_target = reward_batch + Gamma * self.critic(next_state_batch) * done_batch
            delta = td_target - self.critic(state_batch)
            delta = delta.detach().numpy()
            # accumulate the advantage backwards over the episode
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = Gamma * advantage + delta_t
                advantage_list.append(advantage)
            advantage_list.reverse()
            advantage = torch.Tensor(advantage_list)
            prob = self.act(state_batch, 1).squeeze(0)
            prob_a = prob.gather(1, action_batch.view(-1, 1))
            ratio = torch.exp(torch.log(prob_a) - torch.log(rate_batch))  # importance ratio
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.critic(state_batch), td_target.detach())
            self.optim.zero_grad()
            loss.mean().backward()
            self.optim.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    Agent = PPO(env.observation_space.shape[0], 256, env.action_space.n)
    average_reward = 0
    for i_episode in range(num_episode):
        s0 = env.reset()
        tot_reward = 0
        while True:
            env.render()
            prob = Agent.act(torch.from_numpy(s0).float(), 0)
            a0 = int(prob.multinomial(1))
            s1, r, done, _ = env.step(a0)
            rate = prob[a0].item()
            Agent.put(s0, a0, r, s1, rate, 1 - done)
            s0 = s1
            tot_reward += r
            if done:
                average_reward = average_reward + 1 / (i_episode + 1) * (tot_reward - average_reward)
                if i_episode % 20 == 0:
                    print('Episode ', i_episode, ' tot_reward: ', tot_reward,
                          ' average_reward: ', average_reward)
                break
        Agent.update_parameters()
        Agent.buffer.clean()
Update, September 14, 2022:
The PPO and DDPG code has been rewritten, and Dueling DQN and Actor-Critic implementations have been added, following the book Hands-on Reinforcement Learning (《动手学强化学习》).
The DQN, Double DQN, and Dueling DQN code differs by only a few lines, so only the Dueling DQN code is recorded here.
Dueling DQN:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import collections
import random
import numpy as np
import torch.optim as optim


class Q_Net(nn.Module):
    """Dueling architecture: a shared layer feeding an advantage head and a value head."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Q_Net, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, action_dim)  # advantage head A(s, a)
        self.Linear3 = nn.Linear(hidden_dim, 1)           # value head V(s)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        advantages = self.Linear2(out)
        values = self.Linear3(out)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        if len(advantages.shape) == 2:
            return values + advantages - torch.mean(advantages, dim=1).view(-1, 1)
        else:
            return values + advantages - torch.mean(advantages)


class DQN:
    def __init__(self, state_dim, hidden_dim, action_dim, eps, gamma):
        super(DQN, self).__init__()
        self.net = Q_Net(state_dim, hidden_dim, action_dim)
        self.target_net = Q_Net(state_dim, hidden_dim, action_dim)
        self.loss_func = nn.MSELoss()
        self.eps = eps
        self.action_dim = action_dim
        self.gamma = gamma
        self.optimizer = optim.Adam(params=self.net.parameters(), lr=2e-3)
        self.count = 0

    def take_action(self, state):
        if np.random.random() < self.eps:
            return np.random.randint(self.action_dim)
        else:
            value = self.net(state)
            return torch.argmax(value).item()

    def update(self, data):
        state, action, reward, next_state, done = zip(*data)
        states = torch.tensor(state, dtype=torch.float)
        actions = torch.tensor(action, dtype=torch.long).view(-1, 1)
        rewards = torch.tensor(reward, dtype=torch.float)
        next_states = torch.tensor(next_state, dtype=torch.float)
        dones = torch.tensor(done, dtype=torch.long)
        next_values = self.target_net(next_states).max(dim=1)[0]
        targets = rewards + (self.gamma * next_values) * (1 - dones)
        values = self.net(states).gather(1, actions).squeeze()
        loss = torch.mean(F.mse_loss(values, targets))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.count % 10 == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        self.count += 1


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)


lr = 2e-3
num_episodes = 5000
hidden_dim = 128
gamma = 0.98
epsilon = 0.1
target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 64
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim, hidden_dim, action_dim, epsilon, gamma)

for i in range(num_episodes):
    state = env.reset()
    done = 0
    G = 0
    while not done:
        action = agent.take_action(torch.tensor(state, dtype=torch.float))
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
        if replay_buffer.size() > minimal_size:
            data = replay_buffer.sample(batch_size)
            agent.update(data)
        G += reward
    if i % 10 == 0:
        print(G)
Actor Critic:
Set up the two networks and construct the loss functions from the standard Actor-Critic formulas.
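Concretely, the two losses used here are the ones below (a sketch with placeholder names; the TD error is detached where it weights the policy gradient so that the actor loss does not update the critic):

import torch
import torch.nn.functional as F

def actor_critic_losses(log_probs, values, td_targets):
    # values = V(s_t) from the critic; td_targets = r_t + gamma * V(s_{t+1}) * (1 - done)
    td_delta = td_targets - values
    actor_loss = -(td_delta.detach() * log_probs).mean()   # policy gradient weighted by the TD error
    critic_loss = F.mse_loss(values, td_targets.detach())  # fit V(s_t) to the TD target
    return actor_loss, critic_loss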
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np


class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(Critic, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, 1)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        return self.Linear2(out)


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        return F.softmax(self.Linear2(out), dim=1)


class ActorCritic:
    def __init__(self, state_dim, hidden_dim, action_dim, gamma):
        self.actor = Actor(state_dim, hidden_dim, action_dim)
        self.critic = Critic(state_dim, hidden_dim)
        self.action_dim = action_dim
        self.gamma = gamma
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=2e-3)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=2e-3)

    def update(self, data):
        states = torch.tensor(data['states'], dtype=torch.float)
        actions = torch.tensor(data['actions'], dtype=torch.long).view(-1, 1)
        next_states = torch.tensor(data['next_states'], dtype=torch.float)
        rewards = torch.tensor(data['rewards'], dtype=torch.float)
        dones = torch.tensor(data['done'], dtype=torch.long)
        td_target = rewards + (self.gamma * self.critic(next_states).squeeze()) * (1 - dones)
        td_delta = td_target - self.critic(states).squeeze()
        log_probs = torch.log(self.actor(states).gather(1, actions)).squeeze()
        actor_loss = torch.mean(td_delta.detach() * -log_probs)
        critic_loss = torch.mean(F.mse_loss(self.critic(states).squeeze(), td_target.detach()))
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.step()

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        return action_dist.sample().item()


lr = 2e-3
num_episodes = 5000
hidden_dim = 128
gamma = 0.98
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = ActorCritic(state_dim, hidden_dim, action_dim, gamma)

for i in range(num_episodes):
    data = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'done': []}
    done = 0
    state = env.reset()
    G = 0
    while not done:
        action = agent.take_action(state)
        next_state, reward, done, _ = env.step(action)
        data['states'].append(state)
        data['actions'].append(action)
        data['next_states'].append(next_state)
        data['rewards'].append(reward)
        data['done'].append(done)
        state = next_state
        G += reward
    agent.update(data)
    if i % 10 == 0:
        print(G)
PPO:
PPO is essentially an improvement on Actor-Critic: the clip operation keeps the new and old policy distributions from drifting too far apart. Because it is an on-policy algorithm, the data collected in each rollout is trained on for several epochs to improve data efficiency.
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np


class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(Critic, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, 1)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        return self.Linear2(out)


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        return F.softmax(self.Linear2(out), dim=1)


class PPO:
    def __init__(self, state_dim, hidden_dim, action_dim, gamma, lamda, epochs, eps):
        self.actor = Actor(state_dim, hidden_dim, action_dim)
        self.critic = Critic(state_dim, hidden_dim)
        self.action_dim = action_dim
        self.gamma = gamma
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=2e-3)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=2e-3)
        self.epochs = epochs
        self.eps = eps
        self.lamda = lamda

    def update(self, data):
        states = torch.tensor(data['states'], dtype=torch.float)
        actions = torch.tensor(data['actions'], dtype=torch.long).view(-1, 1)
        next_states = torch.tensor(data['next_states'], dtype=torch.float)
        rewards = torch.tensor(data['rewards'], dtype=torch.float)
        dones = torch.tensor(data['done'], dtype=torch.long)
        # probabilities under the policy that collected the data
        old_log_probs = torch.log(self.actor(states).gather(1, actions)).squeeze().detach()
        td_target = rewards + (self.gamma * self.critic(next_states).squeeze()) * (1 - dones)
        td_delta = td_target - self.critic(states).squeeze()
        advantages = self.cal_advantage(self.gamma, self.lamda, td_delta)
        for i in range(self.epochs):  # several epochs over the same on-policy batch
            log_probs = torch.log(self.actor(states).gather(1, actions)).squeeze()
            ratio = torch.exp(log_probs - old_log_probs)
            l1 = advantages * ratio
            l2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantages
            actor_loss = -torch.mean(torch.min(l1, l2))
            critic_loss = torch.mean(F.mse_loss(self.critic(states).squeeze(), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()

    def cal_advantage(self, gamma, lamda, td_delta):
        # generalized advantage estimation, accumulated backwards over the episode
        td_delta = td_delta.detach().numpy()
        advantage_list = []
        advantage = 0.0
        for delta in td_delta[::-1]:
            advantage = gamma * lamda * advantage + delta
            advantage_list.append(advantage)
        advantage_list.reverse()
        return torch.tensor(advantage_list, dtype=torch.float)

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        return action_dist.sample().item()


lr = 2e-3
num_episodes = 5000
hidden_dim = 128
gamma = 0.98
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
epochs = 10
lamda = 0.95
eps = 0.2
agent = PPO(state_dim, hidden_dim, action_dim, gamma, lamda, epochs, eps)

for i in range(num_episodes):
    data = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'done': []}
    done = 0
    state = env.reset()
    G = 0
    while not done:
        action = agent.take_action(state)
        next_state, reward, done, _ = env.step(action)
        data['states'].append(state)
        data['actions'].append(action)
        data['next_states'].append(next_state)
        data['rewards'].append(reward)
        data['done'].append(done)
        state = next_state
        G += reward
    agent.update(data)
    if i % 10 == 0:
        print(G)
SAC: essentially adds an entropy-regularization term to the Q-value estimate, which guarantees a certain amount of exploration. The temperature alpha is a learnable parameter: early on it stays relatively large to keep the policy exploratory, and later, once the policy becomes more deterministic, it shrinks.
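The "learnable alpha" point boils down to the temperature loss below (a sketch with placeholder names; `entropy` stands for the policy entropy at the sampled states and `target_entropy` for the desired entropy level): while the entropy is below the target the loss pushes alpha up, which keeps exploration alive, and once the entropy exceeds the target the loss pushes alpha back down.

import torch

def temperature_loss(entropy, target_entropy, log_alpha):
    # entropy below target  -> loss decreases as alpha grows   -> more exploration
    # entropy above target  -> loss decreases as alpha shrinks -> more exploitation
    return torch.mean((entropy - target_entropy).detach() * log_alpha.exp())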
Code:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch.optim as optim
import gym
import random
import collections


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        ret = F.relu(self.Linear1(states))
        return F.softmax(self.Linear2(ret), dim=1)


class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Critic, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        ret = F.relu(self.Linear1(states))
        return self.Linear2(ret)


class SAC:
    def __init__(self, state_dim, hidden_dim, action_dim, target_entropy, tau, gamma):
        self.actor = Actor(state_dim, hidden_dim, action_dim)
        # two critics with target copies (clipped double-Q)
        self.critic1 = Critic(state_dim, hidden_dim, action_dim)
        self.critic1_target = Critic(state_dim, hidden_dim, action_dim)
        self.critic1_target.load_state_dict(self.critic1.state_dict())
        self.critic2 = Critic(state_dim, hidden_dim, action_dim)
        self.critic2_target = Critic(state_dim, hidden_dim, action_dim)
        self.critic2_target.load_state_dict(self.critic2.state_dict())
        self.critic1_optimizer = optim.Adam(self.critic1.parameters())
        self.critic2_optimizer = optim.Adam(self.critic2.parameters())
        self.actor_optimizer = optim.Adam(self.actor.parameters())
        # learnable temperature, optimized through log_alpha
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float)
        self.log_alpha.requires_grad = True
        self.log_alpha_optimizer = torch.optim.Adam([self.log_alpha])
        self.target_entropy = target_entropy
        self.gamma = gamma
        self.tau = tau

    def calc_td_target(self, rewards, next_states, dones):
        # soft value of the next state: E_a[min(Q1, Q2)] + alpha * entropy
        next_probs = self.actor(next_states)
        next_log_probs = torch.log(next_probs + 1e-8)
        entropy = -torch.sum(next_probs * next_log_probs, dim=1, keepdim=True)
        q1_value = self.critic1_target(next_states)
        q2_value = self.critic2_target(next_states)
        min_qvalue = torch.sum(next_probs * torch.min(q1_value, q2_value), dim=1, keepdim=True)
        next_value = min_qvalue + self.log_alpha.exp() * entropy
        td_target = rewards + self.gamma * next_value * (1 - dones)
        return td_target

    def update(self, data):
        state, action, reward, next_state, done = zip(*data)
        states = torch.tensor(state, dtype=torch.float)
        actions = torch.tensor(action, dtype=torch.long).view(-1, 1)
        rewards = torch.tensor(reward, dtype=torch.float).view(-1, 1)
        next_states = torch.tensor(next_state, dtype=torch.float)
        dones = torch.tensor(done, dtype=torch.long).squeeze().view(-1, 1)
        td_target = self.calc_td_target(rewards, next_states, dones)
        # critic updates
        critic1_q_values = self.critic1(states).gather(1, actions)
        critic1_loss = torch.mean(F.mse_loss(critic1_q_values, td_target.detach()))
        critic2_q_values = self.critic2(states).gather(1, actions)
        critic2_loss = torch.mean(F.mse_loss(critic2_q_values, td_target.detach()))
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()
        # actor update: maximize E_a[min(Q1, Q2)] + alpha * entropy
        probs = self.actor(states)
        log_probs = torch.log(probs + 1e-8)
        entropy = -torch.sum(probs * log_probs, dim=1, keepdim=True)
        q1_value = self.critic1(states)
        q2_value = self.critic2(states)
        min_qvalue = torch.sum(probs * torch.min(q1_value, q2_value), dim=1, keepdim=True)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy - min_qvalue)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # temperature update
        alpha_loss = torch.mean((entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.log_alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.log_alpha_optimizer.step()
        self.soft_update(self.critic1, self.critic1_target)
        self.soft_update(self.critic2, self.critic2_target)

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)


actor_lr = 1e-3
critic_lr = 1e-2
alpha_lr = 1e-2
num_episodes = 2000
hidden_dim = 128
gamma = 0.98
tau = 0.005  # soft-update coefficient
buffer_size = 10000
minimal_size = 500
batch_size = 128
target_entropy = -1
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = SAC(state_dim, hidden_dim, action_dim, target_entropy, tau, gamma)

for i in range(num_episodes):
    state = env.reset()
    done = 0
    G = 0
    while not done:
        action = agent.take_action(state)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, done)
        state = next_state
        if replay_buffer.size() > minimal_size:
            data = replay_buffer.sample(batch_size)
            agent.update(data)
        G += reward
    if i % 10 == 0:
        print(i, " ", G)
DDPG:
The old version's approach was a bit of a hack: it breaks as soon as there are more than two distinct actions, and it also forgot to add noise (an exploration mechanism), so it sometimes failed to learn the optimal solution.
In the new version, the actor outputs a probability distribution over the discrete actions.
The new version uses a trick called Gumbel-Softmax, which makes the produced one-hot action vector differentiable so it can be back-propagated through; the one-hot vector is also noisy, which provides exploration.
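The core of that trick is the straight-through estimator (a sketch; `logits` is a placeholder tensor of shape (batch, num_actions), and it uses PyTorch's built-in F.gumbel_softmax rather than the hand-rolled version in the code below): the hard one-hot vector is used in the forward pass, while gradients flow through the soft, noisy sample.

import torch.nn.functional as F

def straight_through_onehot(logits, temperature=1.0):
    y_soft = F.gumbel_softmax(logits, tau=temperature, hard=False)        # noisy, differentiable sample
    y_hard = F.one_hot(y_soft.argmax(dim=-1), logits.shape[-1]).float()   # discrete one-hot action
    return (y_hard - y_soft).detach() + y_soft                            # forward: one-hot; backward: soft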
Code:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch.optim as optim
import gym
import random
import collections


def onehot_from_logits(logits, eps=0.01):
    '''Return the one-hot form of the greedy action, with epsilon-greedy randomness.'''
    argmax_acs = (logits == logits.max(1, keepdim=True)[0]).float()
    # generate random actions and convert them to one-hot form
    rand_acs = torch.autograd.Variable(
        torch.eye(logits.shape[1])[np.random.choice(range(logits.shape[1]), size=logits.shape[0])],
        requires_grad=False).to(logits.device)
    # mix greedy one-hots and random one-hots
    res = torch.stack([
        argmax_acs[i] if r > eps else rand_acs[i]
        for i, r in enumerate(torch.rand(logits.shape[0]))
    ]).squeeze()
    return res


def sample_gumbel(shape, eps=1e-20, tens_type=torch.FloatTensor):
    """Sample from a Gumbel(0, 1) distribution."""
    U = torch.autograd.Variable(tens_type(*shape).uniform_(), requires_grad=False)
    return -torch.log(-torch.log(U + eps) + eps)


def gumbel_softmax_sample(logits, temperature=1.0):
    """Sample from the Gumbel-Softmax distribution."""
    y = logits + sample_gumbel(logits.shape, tens_type=type(logits.data)).to(logits.device)
    return F.softmax(y / temperature, dim=1)


def gumbel_softmax(logits, temperature=1.0):
    """Sample from the Gumbel-Softmax distribution and discretize."""
    y = gumbel_softmax_sample(logits, temperature)
    y_hard = onehot_from_logits(y)
    # return a one-hot equal to y_hard whose gradient is that of y: we get a discrete action
    # to interact with the environment while still back-propagating correctly
    y = (y_hard.to(logits.device) - y).detach() + y
    return y


class Q_Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Q_Critic, self).__init__()
        self.Linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.Linear3 = nn.Linear(hidden_dim, 1)

    def forward(self, states, actions):
        out = F.relu(self.Linear1(torch.cat([states, actions], dim=1)))
        return self.Linear3(out)


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.Linear3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        out = F.relu(self.Linear2(out))
        return F.softmax(self.Linear3(out), dim=1)


class DDPG:
    def __init__(self, state_dim, hidden_dim, action_dim, gamma, tau):
        self.actor = Actor(state_dim, hidden_dim, action_dim)
        self.critic = Q_Critic(state_dim, hidden_dim, action_dim)
        self.target_actor = Actor(state_dim, hidden_dim, action_dim)
        self.target_critic = Q_Critic(state_dim, hidden_dim, action_dim)
        # start the target networks from the same weights as the online networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.action_dim = action_dim
        self.gamma = gamma
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=2e-3)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=2e-3)
        self.tau = tau

    def update(self, data):
        state, action, reward, next_state, done = zip(*data)
        states = torch.tensor(state, dtype=torch.float)
        actions = torch.tensor(action, dtype=torch.float).view(-1, 2)  # stored one-hot actions
        rewards = torch.tensor(reward, dtype=torch.float)
        next_states = torch.tensor(next_state, dtype=torch.float)
        dones = torch.tensor(done, dtype=torch.long).squeeze()
        # critic update
        next_actions = gumbel_softmax(self.target_actor(next_states))
        td_target = rewards + self.gamma * self.target_critic(next_states, next_actions).squeeze() * (1 - dones)
        critic_loss = F.mse_loss(self.critic(states, actions).squeeze(), td_target.detach())
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # actor update
        self.actor_optimizer.zero_grad()
        now_actions = gumbel_softmax(self.actor(states))
        actor_loss = -torch.mean(self.critic(states, now_actions))
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.actor, self.target_actor)    # soft-update the policy network
        self.soft_update(self.critic, self.target_critic)  # soft-update the value network

    def take_action(self, states):
        action = self.actor(states)
        action = gumbel_softmax(action)  # noisy one-hot action
        return action.detach()

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)


lr = 2e-3
num_episodes = 5000
hidden_dim = 128
gamma = 0.98
epsilon = 0.1
target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 128
tau = 0.005
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DDPG(state_dim, hidden_dim, action_dim, gamma, tau)

for i in range(num_episodes):
    state = env.reset()
    done = 0
    G = 0
    while not done:
        action = agent.take_action(torch.tensor([state], dtype=torch.float))
        next_state, reward, done, _ = env.step(torch.argmax(action).squeeze().numpy().item())
        replay_buffer.add(state, action.squeeze().detach().numpy(), reward, next_state, done)
        state = next_state
        if replay_buffer.size() > minimal_size:
            data = replay_buffer.sample(batch_size)
            agent.update(data)
        G += reward
    if i % 10 == 0:
        print(i, " ", G)
In fact, for this game the Gumbel-Softmax function is not strictly necessary.
Recall that when DQN is applied to a continuous action space, one workaround is to discretize it into a few fixed actions; for a discrete action space we can reason by analogy. In DDPG the actor produces a deterministic action, but for a discrete action space the argmax that picks the action with the largest Q value is not differentiable. One way around this is the Gumbel-Softmax function, which yields a differentiable one-hot vector and thus a deterministic action. Another way is to note that although the action itself is not differentiable, the action's probability distribution is: let the actor output the action probability distribution instead of the action itself (in other words, let the probability distribution play the role the action plays in DDPG), and convert the distribution into a concrete action only right before interacting with the environment, outside the gradient path (though I am not sure this still counts as DDPG).
(Note: both methods need more episodes than the old version to learn a good policy, i.e., the new version is less sample-efficient than the old one; perhaps the old sigmoid output is simply easier to learn, though the exact reason is unknown.)
(Note: method 2 sometimes needs a large number of episodes before it learns a good policy; the reason is unknown.)
Code:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch.optim as optim
import gym
import random
import collections


class Q_Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Q_Critic, self).__init__()
        self.Linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.Linear3 = nn.Linear(hidden_dim, 1)

    def forward(self, states, actions):
        out = F.relu(self.Linear1(torch.cat([states, actions], dim=1)))
        return self.Linear3(out)


class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Actor, self).__init__()
        self.Linear1 = nn.Linear(state_dim, hidden_dim)
        self.Linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.Linear3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, states):
        out = F.relu(self.Linear1(states))
        out = F.relu(self.Linear2(out))
        return F.softmax(self.Linear3(out), dim=1)


class DDPG:
    def __init__(self, state_dim, hidden_dim, action_dim, gamma, tau):
        self.actor = Actor(state_dim, hidden_dim, action_dim)
        self.critic = Q_Critic(state_dim, hidden_dim, action_dim)
        self.target_actor = Actor(state_dim, hidden_dim, action_dim)
        self.target_critic = Q_Critic(state_dim, hidden_dim, action_dim)
        # start the target networks from the same weights as the online networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.action_dim = action_dim
        self.gamma = gamma
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=2e-3)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=2e-3)
        self.tau = tau

    def update(self, data):
        state, action, reward, next_state, done = zip(*data)
        states = torch.tensor(state, dtype=torch.float)
        actions = torch.tensor(action, dtype=torch.float).view(-1, 2)  # stored action probabilities
        rewards = torch.tensor(reward, dtype=torch.float)
        next_states = torch.tensor(next_state, dtype=torch.float)
        dones = torch.tensor(done, dtype=torch.long).squeeze()
        # the "action" fed to the critic is the action probability distribution itself
        next_actions = self.target_actor(next_states)
        td_target = rewards + self.gamma * self.target_critic(next_states, next_actions).squeeze() * (1 - dones)
        critic_loss = F.mse_loss(self.critic(states, actions).squeeze(), td_target.detach())
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.actor_optimizer.zero_grad()
        now_actions = self.actor(states)
        actor_loss = -torch.mean(self.critic(states, now_actions))
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.actor, self.target_actor)    # soft-update the policy network
        self.soft_update(self.critic, self.target_critic)  # soft-update the value network

    def take_action(self, states):
        # Epsilon-greedy also works here; the key point is that action selection must include exploration.
        # if np.random.random() < 0.1:
        #     samp = np.random.randint(2)
        #     if samp == 0:
        #         return torch.tensor([samp]), torch.tensor([1, 0])
        #     else:
        #         return torch.tensor([samp]), torch.tensor([0, 1])
        # else:
        #     action = self.actor(states)
        #     return torch.argmax(action, dim=1), action
        action = self.actor(states)
        action_dist = torch.distributions.Categorical(action)
        samp = action_dist.sample().detach()
        return samp, action.detach()

    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)


lr = 2e-3
num_episodes = 5000
hidden_dim = 128
gamma = 0.98
epsilon = 0.1
target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 128
tau = 0.005
env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DDPG(state_dim, hidden_dim, action_dim, gamma, tau)

for i in range(num_episodes):
    state = env.reset()
    done = 0
    G = 0
    while not done:
        samp, action = agent.take_action(torch.tensor([state], dtype=torch.float))
        next_state, reward, done, _ = env.step(samp.numpy().item())
        replay_buffer.add(state, action.squeeze().detach().numpy(), reward, next_state, done)
        state = next_state
        if replay_buffer.size() > minimal_size:
            data = replay_buffer.sample(batch_size)
            agent.update(data)
        G += reward
    if i % 10 == 0:
        print(i, " ", G)