Reinforcement Learning Notes
Preliminaries
Robbins-Monro Algorithm
The Robbins-Monro (RM) algorithm is designed to solve the following root-finding equation:

$$g(w) = 0,$$

where $g(w)$ cannot be evaluated directly: we can only observe noisy samples $\tilde{g}(w, \eta) = g(w) + \eta$, with the noise $\eta$ drawn from a distribution that may itself be parameterized by $w$. We can use the following rule to obtain the root $w^*$:

$$w_{k+1} = w_k - a_k\,\tilde{g}(w_k, \eta_k),$$

where $\{a_k\}$ is a positive step-size sequence satisfying $\sum_k a_k = \infty$ and $\sum_k a_k^2 < \infty$.
The Q-learning algorithm uses Robbins-Monro to update the Q function, i.e.,

$$Q_{k+1}(s_t, a_t) = Q_k(s_t, a_t) - \alpha_k \Big[ Q_k(s_t, a_t) - \big( r_{t+1} + \gamma \max_{a} Q_k(s_{t+1}, a) \big) \Big].$$

The original equation relating $Q(s, a)$, $r$ and $s'$ is the Bellman optimality equation

$$Q(s, a) = \sum_{r} p(r \mid s, a)\, r + \gamma \sum_{s'} p(s' \mid s, a) \max_{a'} Q(s', a').$$

The sums over $r$ and $s'$ can be considered as an expectation,

$$Q(s, a) = \mathbb{E}\big[\, r + \gamma \max_{a'} Q(s', a') \;\big|\; s, a \,\big],$$

and if we take the sampled $r_{t+1}$ and $s_{t+1}$ as a noisy observation of this expectation, the above update follows the Robbins-Monro algorithm.
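As a concrete illustration, here is a minimal tabular sketch of this update; the Q table, step size alpha, and discount gamma are illustrative assumptions, not part of the derivation above.

import numpy as np

def q_learning_update(Q, s, a, r, s_next, done, alpha=0.1, gamma=0.99):
    # Sampled (noisy) Bellman target: r + gamma * max_a' Q(s', a'), or just r at a terminal state
    target = r if done else r + gamma * np.max(Q[s_next])
    # Robbins-Monro step: Q(s, a) <- Q(s, a) - alpha * [Q(s, a) - target]
    Q[s, a] -= alpha * (Q[s, a] - target)
    return Q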
Policy Gradient

Now, we parameterize the policy distribution as $\pi(a \mid s, \theta)$ with parameter $\theta$.

Let the reward (objective) function be

$$J(\theta) = \mathbb{E}_{s \sim d_\pi,\, a \sim \pi(\cdot \mid s, \theta)}\big[ q_\pi(s, a) \big].$$

Lemma:

If $\pi(a \mid s, \theta)$ is differentiable and $\pi(a \mid s, \theta) > 0$, then for the equation above we have

$$\nabla_\theta J(\theta) = \mathbb{E}_{s \sim d_\pi,\, a \sim \pi(\cdot \mid s, \theta)}\big[ \nabla_\theta \ln \pi(a \mid s, \theta)\, q_\pi(s, a) \big].$$

Using this lemma, we have the stochastic gradient-ascent update

$$\theta_{t+1} = \theta_t + \alpha\, \nabla_\theta \ln \pi(a_t \mid s_t, \theta_t)\, q_t(s_t, a_t).$$

Let $q_t(s_t, a_t)$ be an estimate of $q_\pi(s_t, a_t)$: estimating it with Monte Carlo returns gives REINFORCE, while estimating it with a learned value network gives the actor-critic method below.
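The update above can be implemented directly with automatic differentiation. Below is a minimal sketch for a single transition, assuming `policy_net` is a Keras model that outputs action probabilities (as the actor below does) and `q_value` is a scalar estimate of $q_\pi(s_t, a_t)$ treated as a constant; these names are illustrative.

import tensorflow as tf

def policy_gradient_step(policy_net, optimizer, state, action, q_value):
    # One stochastic gradient-ascent step: theta <- theta + alpha * grad(ln pi(a|s)) * q
    with tf.GradientTape() as tape:
        probs = policy_net(state[None, :], training=True)  # pi(. | s, theta), shape (1, n_actions)
        log_prob = tf.math.log(probs[0, action] + 1e-8)    # ln pi(a | s, theta)
        loss = -log_prob * q_value                         # minimizing -ln(pi) * q ascends J
    grads = tape.gradient(loss, policy_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))
    return loss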
Actor-Critic (AC)
We can use a critic network to estimate the value function (which gives the TD target and the advantage) and an actor network to learn $\pi(a \mid s, \theta)$.
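The implementation below is an advantage actor-critic (A2C-style) agent for CartPole-v1, written with TensorFlow 2 / Keras. The actor ends in a softmax over the discrete actions, the critic outputs a scalar state value $V(s)$, the critic regresses the TD target $r + \gamma V(s')$, and the TD error (the advantage) weights the actor's cross-entropy loss. The script uses the pre-0.26 Gym API, in which env.reset() returns only the observation and env.step() returns four values.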
import argparse

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# Let TensorFlow allocate GPU memory on demand instead of grabbing it all up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)

parser = argparse.ArgumentParser()
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--update_interval', type=int, default=5)
parser.add_argument('--actor_lr', type=float, default=0.0005)
parser.add_argument('--critic_lr', type=float, default=0.001)
args = parser.parse_args()
class Actor:
def __init__(self, state_dim, action_dim):
self.state_dim = state_dim
self.action_dim = action_dim
self.opt = Adam(args.actor_lr)
self.model = self.create_model()
def create_model(self):
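        # Policy network: maps a state to a softmax distribution over the discrete actions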
return tf.keras.Sequential([
Input((self.state_dim, )),
Dense(32, activation='relu'),
Dense(16, activation='relu'),
Dense(self.action_dim, activation='softmax')
])
    def compute_loss(self, actions, probs, advantages):
        # Policy-gradient loss: cross-entropy of the chosen actions, weighted per sample by the
        # (stop-gradient) advantage. The model ends in a softmax, so these are probabilities,
        # not logits, and from_logits must be False.
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(actions, probs, sample_weight=tf.stop_gradient(advantages))
        return policy_loss

    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            probs = self.model(states, training=True)
            loss = self.compute_loss(actions, probs, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss
class Critic:
def __init__(self, state_dim, action_dim):
self.state_dim = state_dim
self.action_dim = action_dim
self.opt = Adam(args.critic_lr)
self.model = self.create_model()
def create_model(self):
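        # Value network: maps a state to a scalar estimate of V(s)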
return tf.keras.Sequential([
Input((self.state_dim,)),
Dense(32, activation='relu'),
Dense(16, activation='relu'),
Dense(16, activation='relu'),
Dense(1, activation='linear')
])
def compute_loss(self, v_pred, td_targets):
mse = tf.keras.losses.MeanSquaredError()
return mse(td_targets, v_pred)
def train(self, states, td_targets):
with tf.GradientTape() as tape:
v_pred = self.model(states, training=True)
assert v_pred.shape == td_targets.shape
loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
class Agent:
def __init__(self, env):
self.env = env
self.state_dim = env.observation_space.shape[0]
self.action_dim = env.action_space.n
self.actor = Actor(self.state_dim, self.action_dim)
self.critic = Critic(self.state_dim, self.action_dim)
def td_target(self, reward, next_state, done):
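        # TD(0) target: the reward alone if the episode terminated, otherwise r + gamma * V(s')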
if done:
return reward
v_value = self.critic.model.predict(
np.reshape(next_state, [1, self.state_dim]))
return np.reshape(reward + args.gamma * v_value[0], [1, 1])
def advantage(self, td_targets, baselines):
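        # Advantage estimate: TD target minus the critic's value baseline V(s)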
return td_targets - baselines
    def list_to_batch(self, arr_list):
        # Stack a list of (1, dim) arrays into a single (N, dim) batch
        return np.concatenate(arr_list, axis=0)
def train(self, max_episodes=1000):
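        # Roll out episodes; every update_interval steps (or at episode end), build
        # mini-batches from the buffers and update the actor and the critic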
for ep in range(max_episodes):
state_batch = []
action_batch = []
td_target_batch = []
            advantage_batch = []
episode_reward, done = 0, False
state = self.env.reset()
while not done:
# self.env.render()
probs = self.actor.model.predict(
np.reshape(state, [1, self.state_dim]))
                action = np.random.choice(self.action_dim, p=probs[0])  # choose an action according to the current policy
next_state, reward, done, _ = self.env.step(action)
state = np.reshape(state, [1, self.state_dim])
action = np.reshape(action, [1, 1])
next_state = np.reshape(next_state, [1, self.state_dim])
reward = np.reshape(reward, [1, 1])
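                # Scale the reward by 0.01 when forming the TD target (the unscaled reward is still used for logging below)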
td_target = self.td_target(reward * 0.01, next_state, done)
advantage = self.advantage(
td_target, self.critic.model.predict(state))
state_batch.append(state)
action_batch.append(action)
td_target_batch.append(td_target)
                advantage_batch.append(advantage)
if len(state_batch) >= args.update_interval or done:
states = self.list_to_batch(state_batch)
actions = self.list_to_batch(action_batch)
td_targets = self.list_to_batch(td_target_batch)
                    advantages = self.list_to_batch(advantage_batch)
actor_loss = self.actor.train(states, actions, advantages)
critic_loss = self.critic.train(states, td_targets)
state_batch = []
action_batch = []
td_target_batch = []
                    advantage_batch = []
episode_reward += reward[0][0]
state = next_state[0]
print('EP{} EpisodeReward={}'.format(ep, episode_reward))
def main():
env_name = 'CartPole-v1'
env = gym.make(env_name)
agent = Agent(env)
agent.train()
if __name__ == "__main__":
main()
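To run the script, launch it with Python; the hyperparameters (gamma, the update interval, and the two learning rates) can be overridden on the command line, e.g. --gamma 0.99 --update_interval 5. The assert near the top requires at least one visible GPU; remove it (and the memory-growth call) to run on CPU. The undiscounted reward of each episode is printed when the episode ends.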