Reinforcement Learning Notes
import gym
from gym import envs

env_specs = envs.registry.all()  # list all environments registered in the library
# for e in env_specs:
#     print(e)

env = gym.make("CartPole-v1")  # create the CartPole-v1 environment
env.reset()  # initialize the environment object env; returns the agent's initial observation, e.g. array([-0.00667078, -0.04023064, -0.01557324, 0.04904377])
# env.observation_space  # observation space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
# env.action_space       # action space: Discrete(2)

# Once the environment is initialized it can be used; the core API is the step() method.
action = env.action_space.sample()  # step() expects an action from the action space; sample() draws a random one.
                                    # Each call to step() advances the environment by one step, so it belongs inside a loop that plays out a full episode.
# env.render()  # render the current environment
# env.close()   # close the current environment

# To do better than acting randomly, the agent needs to know what each action did to the environment.
# step() returns four values:
# observation (object): an environment-specific object representing the agent's observation, same meaning as the return value of env.reset()
# reward (float): the reward for the previous action
# done (boolean): whether the environment needs to be reset again
# info (dict): diagnostic information useful for debugging
env.observation_space.high
import gym

env = gym.make("CartPole-v1")
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info)
        if done:
            print(f"Episode finished after {t+1} timesteps")
            break
env.close()
import gym

env = gym.make("MountainCar-v0")
print(f"Observation space = {env.observation_space}")
print(f"Action space = {env.action_space}")
print(f"Observation range = {env.observation_space.low} ~ {env.observation_space.high}")
print(f"Number of actions = {env.action_space.n}")
class BespokeAgent:
    def __init__(self, env):
        pass

    # Decide on an action from the current observation
    def decide(self, observation):
        position, velocity = observation
        lb = min(-0.09 * (position + 0.25) ** 2 + 0.03,
                 0.3 * (position + 0.9) ** 4 - 0.008)
        ub = -0.07 * (position + 0.38) ** 2 + 0.06
        if lb < velocity < ub:
            action = 2
        else:
            action = 0
        return action

    # Learn (this hand-crafted agent does not actually learn)
    def learn(self, *args):
        pass


agent = BespokeAgent(env)
def play_montecarlo(env, agent, render=True, train=False):
    episode_reward = 0.
    observation = env.reset()
    while True:
        if render:
            env.render()  # show the graphical interface
        action = agent.decide(observation)  # the agent decides which action to take
        next_observation, reward, done, _ = env.step(action)  # execute the action
        episode_reward += reward  # accumulate the episode reward
        if train:
            agent.learn(observation, action, reward, done)  # learn
        if done:
            break
        observation = next_observation
    return episode_reward  # return the total episode reward


# env.seed(0)
# Play one episode
episode_reward = play_montecarlo(env, agent, train=True)
print(f"Episode reward: {episode_reward}")
env.close()
# Interact for several consecutive episodes and report the average episode reward
import numpy as np

episode_rewards = [play_montecarlo(env, agent, train=True) for _ in range(10)]
print(f"Mean episode reward = {np.mean(episode_rewards)}")
env.close()
Discrete Markov Decision Processes
import gym
import numpy as np

env = gym.make("CliffWalking-v0")
print(f"Observation space = {env.observation_space}")
print(f"Action space = {env.action_space}")
print(f"Number of states = {env.nS}  Number of actions = {env.nA}")
print(f"Map size = {env.shape}")


def play_once(env, policy):
    total_reward = 0
    state = env.reset()
    loc = np.unravel_index(state, env.shape)
    print(f"state = {state}, location = {loc}")
    while True:
        loc = np.unravel_index(state, env.shape)
        # Sample an action from {0, ..., nA-1} with the probabilities the policy
        # assigns to this state, i.e. draw the policy's action for this state.
        action = np.random.choice(env.nA, p=policy[state])
        # Take one step; the environment returns its feedback.
        next_state, reward, done, _ = env.step(action)
        print(f"state: {state}  location: {loc}  reward: {reward}")
        total_reward += reward
        if done:
            break
        state = next_state
    return total_reward


actions = np.ones(env.shape, dtype=int)
actions[-1, :] = 0
actions[:, -1] = 2
optimal_policy = np.eye(4)[actions.reshape(-1)]  # the optimal deterministic policy as one-hot action vectors

total_rewards = play_once(env, optimal_policy)
print(f"Total reward: {total_rewards}")


def evaluate_bellman(env, policy, gamma=1.):
    # a is the coefficient matrix and b the constant vector of the linear system a v = b
    a, b = np.eye(env.nS), np.zeros((env.nS))
    for state in range(env.nS - 1):
        for action in range(env.nA):
            pi = policy[state][action]  # probability of taking this action in this state
            # env.P stores the environment dynamics
            for p, next_state, reward, done in env.P[state][action]:
                a[state, next_state] -= (pi * gamma * p)
                b[state] += (pi * reward * p)
    # Solve the linear system to obtain the state values
    v = np.linalg.solve(a, b)
    print(v)
    q = np.zeros((env.nS, env.nA))
    # Use the state values to compute the action values
    for state in range(env.nS - 1):
        for action in range(env.nA):
            for p, next_state, reward, done in env.P[state][action]:
                q[state][action] += ((reward + gamma * v[next_state]) * p)
    return v, q


policy = np.random.uniform(size=(env.nS, env.nA))
policy = policy / np.sum(policy, axis=1)[:, np.newaxis]  # normalize each row into a probability distribution
state_values, action_values = evaluate_bellman(env, policy)
print(f"State values = {state_values}\nAction values = {action_values}")

# Evaluate the optimal policy
optimal_state_values, optimal_action_values = evaluate_bellman(env, optimal_policy)
print(f"Optimal state values = {optimal_state_values}\nOptimal action values = {optimal_action_values}")
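evaluate_bellman solves the Bellman expectation equation as a linear system a v = b rather than by iteration; for each non-terminal state the equation being rearranged is

$$V_\pi(s) = \sum_a \pi(a \mid s) \sum_{s'} p(s' \mid s, a)\,\big[r(s,a,s') + \gamma V_\pi(s')\big]$$

Moving the gamma * V(s') terms to the left-hand side produces the rows of a (the `-= pi * gamma * p` updates), while the expected immediate rewards accumulate into b.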
###########################################################################
GridWorld Experiments
1. Policy Evaluation
# Policy evaluation
from lib.envs.gridworld import GridworldEnv
import numpy as np

env = GridworldEnv()


def policy_value(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy, given the environment's dynamics.
    policy: array of shape |S| x |A| describing the agent's policy
    env: env.P stores the transition tuples (prob, next_state, reward, done);
         env.nS is the number of states, env.nA the number of actions
    discount_factor: discount factor for cumulative returns
    theta: stopping threshold; evaluation stops once every state's value changes by less than this
    """
    V = np.zeros(env.nS)  # initialize all values to 0
    while True:
        delta = 0
        # Sweep over every state
        for s in range(env.nS):
            v = 0
            # Look at every possible action
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at every possible next state
                for prob, next_state, reward, done in env.P[s][a]:
                    # Accumulate the expected value; this "bootstraps":
                    # the next state's value comes from the previous estimate
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            # Update V[s]
            V[s] = v
        if delta < theta:
            break
    return np.array(V)


random_policy = np.ones([env.nS, env.nA]) / env.nA
v = policy_value(random_policy, env)
print(f"Value function: {v.reshape(env.shape)}")
2. Policy Iteration
Both policy iteration and value iteration can solve Markov control problems. Policy iteration alternates two parts: 1) policy evaluation; 2) policy improvement. Value iteration instead works directly with the Bellman optimality equation, finding the optimal value function and hence the optimal policy. A minimal policy-iteration sketch follows; the value-iteration code comes after it.
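Only value iteration is implemented in these notes, so the following is a minimal policy-iteration sketch (not part of the original code) for comparison. It reuses env and the policy_value function from section 1 and the same one-step lookahead idea as value_iteration below.

# Minimal policy-iteration sketch (assumes env and policy_value from section 1 are available)
import numpy as np


def policy_iteration(env, discount_factor=1.0):
    # Start from the uniform random policy
    policy = np.ones([env.nS, env.nA]) / env.nA
    while True:
        # 1) Policy evaluation: compute V for the current policy
        V = policy_value(policy, env, discount_factor)
        policy_stable = True
        # 2) Policy improvement: act greedily with respect to V
        for s in range(env.nS):
            old_action = np.argmax(policy[s])
            # One-step lookahead: value of each action under V
            A = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    A[a] += prob * (reward + discount_factor * V[next_state])
            best_action = np.argmax(A)
            if best_action != old_action:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_action]
        # Stop once the greedy policy no longer changes
        if policy_stable:
            return policy, V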
# Value iteration
def value_iteration(env, discount_factor=1.0, theta=0.00001):
    """Value iteration."""

    def one_step_lookahead(state, V):
        """
        Given a state, compute the value of every action available from it.
        Args:
            state: the state to consider
            V: the current value estimate
        Returns:
            A vector of length nA with the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                # The action is fixed here, so no action_prob is needed
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    V = np.zeros(env.nS)
    while True:
        # Stopping condition
        delta = 0
        # Update every state
        for s in range(env.nS):
            # Values of all actions from this state
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)  # take the value here, not the action
            delta = max(delta, np.abs(best_action_value - V[s]))
            # Update the value function
            V[s] = best_action_value
        if delta < theta:
            break

    # Build a deterministic policy from the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One-step lookahead to find the best action in this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0

    return policy, V
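For reference, the sweep in value_iteration repeatedly applies the Bellman optimality backup mentioned above,

$$V_{k+1}(s) = \max_a \sum_{s'} p(s' \mid s, a)\,\big[r(s,a,s') + \gamma V_k(s')\big]$$

which is exactly what one_step_lookahead followed by np.max computes for each state.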
3. The Gambler's Problem
def value_iteration_gamblers(p_h, discount_factor=1.0, theta=0.0001):
    """
    p_h: probability of the coin coming up heads
    """
    # The reward is 1 only when the capital reaches the goal of 100; 0 everywhere else
    rewards = np.zeros(101)
    rewards[100] = 1

    # Include the two dummy states 0 and 100
    V = np.zeros(101)

    def one_step_lookahead(s, V, rewards):
        """
        s: the gambler's capital
        V: the value of each state
        rewards: the reward vector
        """
        A = np.zeros(101)
        stakes = range(1, min(s, 100 - s) + 1)  # possible stakes: at least 1, at most min(s, 100-s)
        for a in stakes:
            # rewards[s+a], rewards[s-a] are the immediate rewards
            # V[s+a], V[s-a] are the values of the next states
            # Expected value of this stake: win the bet / lose the bet
            A[a] = p_h * (rewards[s + a] + V[s + a] * discount_factor) \
                   + (1 - p_h) * (rewards[s - a] + V[s - a] * discount_factor)
        return A

    while True:
        delta = 0
        for s in range(1, 100):
            A = one_step_lookahead(s, V, rewards)
            # print(A, s, V)  # pick the action with the largest value
            best_action_value = np.max(A)
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break

    # Build a deterministic policy from the optimal values
    policy = np.zeros(100)
    for s in range(100):
        A = one_step_lookahead(s, V, rewards)
        best_action = np.argmax(A)
        policy[s] = best_action

    return policy, V


policy, v = value_iteration_gamblers(0.25)
print(f"Optimized Policy:{policy}")
print(f"Optimized Value Function:{v}")
Monte Carlo (MC) Methods
import gym
import numpy as np
from collections import defaultdict

env = gym.make("Blackjack-v1")
env.reset()


def print_observation(observation):
    score, dealer_score, usable_ace = observation
    print(f"Player Score: {score}, Usable Ace: {usable_ace}, Dealer Score: {dealer_score}")


def strategy(observation):
    score, dealer_score, usable_ace = observation
    # Simple policy: stick (action 0) once the score reaches 20, otherwise hit (action 1)
    return 0 if score >= 20 else 1


for i_episode in range(2):
    observation = env.reset()
    for t in range(2):
        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))
        # Take one step and get the environment's feedback
        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))
            break


def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
    Monte Carlo prediction: sample episodes under the given policy and estimate the value function.
    num_episodes: number of episodes to sample
    Returns: a state -> value dictionary
    """
    return_sum = defaultdict(float)
    return_count = defaultdict(float)
    V = defaultdict(float)  # value function

    for i_episode in range(1, num_episodes + 1):
        # Generate one episode of (state, action, reward) tuples
        episode = []
        state = env.reset()
        for i in range(100):
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Collect the set of states visited in this episode
        states_in_episode = set([tuple(e[0]) for e in episode])
        # print(f"episode:{episode}\nstates_in_episode:{states_in_episode}\n")
        for state in states_in_episode:
            # Find the index of the first visit to this state in the episode
            first_occurence_idx = next(i for i, x in enumerate(episode) if x[0] == state)
            # Sum the discounted rewards from the first visit onwards
            G = sum([x[2] * (discount_factor ** i) for i, x in enumerate(episode[first_occurence_idx:])])
            # Average the returns for this state over all sampled episodes
            return_sum[state] += G
            return_count[state] += 1
            V[state] = return_sum[state] / return_count[state]
    return V


def sample_policy(observation):
    # Returns 0 or 1
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1


V_10k = mc_prediction(sample_policy, env, int(10e4))
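The averaging at the end of mc_prediction is the usual first-visit Monte Carlo estimate: for each state, average the discounted returns observed after its first visit in every sampled episode,

$$V(s) \approx \frac{1}{N(s)} \sum_{k=1}^{N(s)} G^{(k)}, \qquad G = \sum_{i \ge 0} \gamma^{i} r_{t+i+1}$$

where N(s) is what return_count tracks and the G terms accumulate in return_sum.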
MC Control with Epsilon-Greedy Policies
def make_episode_greedy_policy(Q, epsilon, nA):
    """
    Create an epsilon-greedy policy from a given Q function.
    Q: state -> action-value mapping
    epsilon: probability of choosing a random action
    nA: number of actions in the environment
    """
    def policy_fn(observation):
        # observation: the environment state (player score, dealer score, usable ace)
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)  # the greedy action gets the extra probability mass
        return A
    return policy_fn


def mc_control_episode_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo control with an epsilon-greedy policy: find the optimal epsilon-greedy policy.
    """
    return_sum = defaultdict(float)
    return_count = defaultdict(float)
    # Action-value function Q: state -> (action -> action-value)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))  # initialization
    # The policy we follow
    policy = make_episode_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(1, num_episodes + 1):
        # Generate one episode
        episode = []
        state = env.reset()
        for i in range(100):
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)  # sample an action from the distribution
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Collect the sampled state-action pairs
        sa_in_episode = set([(tuple(e[0]), e[1]) for e in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            first_occurance_idx = next(i for i, x in enumerate(episode)
                                       if x[0] == state and x[1] == action)
            # Discounted return from the first occurrence
            G = sum([e[2] * (discount_factor ** i) for i, e in enumerate(episode[first_occurance_idx:])])
            # Average the returns
            return_sum[sa_pair] += G
            return_count[sa_pair] += 1
            # Action value Q; updating Q implicitly updates the policy
            Q[state][action] = return_sum[sa_pair] / return_count[sa_pair]

    return Q, policy


Q, policy = mc_control_episode_greedy(env, 5000)
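The probabilities returned by policy_fn follow the standard epsilon-greedy distribution,

$$\pi(a \mid s) = \begin{cases} 1 - \varepsilon + \varepsilon/|A| & a = \arg\max_{a'} Q(s, a') \\ \varepsilon/|A| & \text{otherwise} \end{cases}$$

so improving Q automatically improves the policy, which is why the control loop never rebuilds the policy explicitly.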
Off-Policy MC Control with Weighted Importance Sampling
def create_random_policy(nA):
    """
    A random policy.
    Returns: a function that returns a vector of action probabilities.
    """
    A = np.ones(nA, dtype=float) / nA

    def policy_fn(observation):
        return A
    return policy_fn


def create_greedy_policy(Q):
    """
    Create a greedy policy from the Q values.
    """
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        best_action = np.argmax(Q[state])
        A[best_action] = 1.0
        return A
    return policy_fn


def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):
    """
    Off-policy MC control with weighted importance sampling.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # Cumulative denominator of the weighted importance sampling estimate
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    # The target policy we want to learn (greedy with respect to Q)
    target_policy = create_greedy_policy(Q)

    for i_episode in range(1, num_episodes + 1):
        # Generate one episode
        episode = []
        state = env.reset()
        for i in range(100):
            probs = behavior_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        G, W = 0.0, 1.0  # G: return, W: importance sampling ratio
        # Walk through the episode backwards
        for t in range(len(episode))[::-1]:
            state, action, reward = episode[t]
            # Update the total return from time t
            G = discount_factor * G + reward
            # Update the cumulative weight
            C[state][action] += W
            # Update the action value
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            # If the action is not the one the (greedy) target policy would take,
            # the importance ratio becomes zero, so stop early
            if action != np.argmax(target_policy(state)):
                break
            W = W * 1.0 / behavior_policy(state)[action]
    return Q, target_policy


random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, 1000, random_policy)
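The backward loop is the incremental weighted importance sampling update. With behavior policy b and greedy target policy pi, each step performs

$$C(s,a) \leftarrow C(s,a) + W, \qquad Q(s,a) \leftarrow Q(s,a) + \frac{W}{C(s,a)}\big[G - Q(s,a)\big], \qquad W \leftarrow W \cdot \frac{\pi(a \mid s)}{b(a \mid s)}$$

Because pi is greedy, pi(a|s) is either 1 or 0, which is why the code multiplies W by 1/b(a|s) and breaks out as soon as the behavior action differs from the greedy one.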
SARSA
from lib.envs.gridworld import GridworldEnv
from lib import plotting
import numpy as np
import itertools
from collections import defaultdict

env = GridworldEnv()
print(f"Number of states: {env.nS}, number of actions: {env.nA}, dynamics of state 0: {env.P[0]}")


def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Create an epsilon-greedy policy from the given Q and epsilon.
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])  # the action with the highest Q value in this state
        A[best_action] += (1 - epsilon)  # keep A a valid probability distribution
        return A
    return policy_fn


def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    SARSA: on-policy TD control. Finds the optimal epsilon-greedy policy.
    On-policy: the same policy is used both to select actions and to be improved.
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))  # initial Q
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))  # episode statistics
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)  # the policy we follow

    for i_episode in range(num_episodes):
        state = env.reset()
        action_probs = policy(state)
        # print(action_probs)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)  # sample from the distribution

        for t in itertools.count():  # iterate until the episode ends
            # Take one step
            next_state, reward, done, _ = env.step(action)
            # Pick the next action epsilon-greedily
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)

            # Track statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] += 1

            # TD update
            td_target = reward + discount_factor * Q[next_state][next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break
            action = next_action
            state = next_state

    return Q, stats


Q, stats = sarsa(env, 200)
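The TD update inside sarsa is the standard on-policy rule, bootstrapping from the action actually chosen in the next state:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\big[r_{t+1} + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t)\big]$$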
Q-learning
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """
    Q-learning: off-policy TD control. Finds the optimal policy while following an epsilon-greedy one.
    Off-policy: there is a target policy and a behavior policy. Exploration uses the epsilon-greedy
    behavior policy, while the Q target uses the action with the largest value in the next state
    (the next action actually taken is not needed).
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))  # initial Q
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))  # episode statistics
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)  # the behavior policy

    for i_episode in range(num_episodes):
        state = env.reset()

        for t in itertools.count():  # iterate until the episode ends
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)  # explore

            # Take one step
            next_state, reward, done, _ = env.step(action)

            # Track statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] += 1

            # TD update
            best_next_action = np.argmax(Q[next_state])  # the best action in the next state
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break
            state = next_state

    return Q, stats


Q, stats = q_learning(env, 200)
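Q-learning differs from SARSA only in the target, which bootstraps from the best next action rather than from the action the behavior policy will actually take:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\big[r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t)\big]$$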
Deep Q-learning
import gym
import os
import random
import itertools
import numpy as np
import tensorflow as tf
from collections import namedtuple
from lib import plotting

env = gym.make("Breakout-v0")
VALID_ACTIONS = [0, 1, 2, 3]  # usable actions: 0: no-op, 1: fire, 2 and 3: move the paddle


class StateProcessor():
    """
    Frame preprocessing: crop and resize the image and convert it to grayscale.
    """
    def __init__(self):
        # Build the TF graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)  # single channel
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)  # crop to 160x160
            self.output = tf.image.resize_images(
                self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
            print(self.output)

    def process(self, sess, state):
        # (210, 160, 3) -> (84, 84)
        return sess.run(self.output, {self.input_state: state})


sp = StateProcessor()
tf.reset_default_graph()


class Estimator():
    """Q-value estimation network, used for both the Q-Network and the Target Network."""
    def __init__(self, scope="estimator", summaries_dir=None):
        self.scope = scope
        self.summary_writer = None
        with tf.variable_scope(scope):
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name='X')
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name='y')  # TD target values
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')  # chosen actions

        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Conv layers
        conv1 = tf.contrib.layers.conv2d(X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))
        print("predictions:", self.predictions)

        # Four stacked frames form one state; pick the prediction of the chosen action for each sample
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        print("gather_indices:", gather_indices)
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        print("action_predictions", self.action_predictions)

        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=tf.contrib.framework.get_global_step())

        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])

    def predict(self, sess, s):
        """Predict the action values."""
        return sess.run(self.predictions, {self.X_pl: s})

    def update(self, sess, s, a, y):
        """
        Update the model and compute the batch loss.
        s: input states [batch_size, 84, 84, 4]
        a: chosen actions
        y: TD targets
        """
        feed_dict = {self.X_pl: s, self.y_pl: y, self.actions_pl: a}
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss


# Test
tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
e = Estimator(scope='test')
sp = StateProcessor()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    observation = env.reset()
    print("observation:", np.shape(observation))
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 3)
    print(e.predict(sess, observations))
    y = np.array([10.0, 10.0, 11.0])
    a = np.array([1, 3, 2])
    print(e.update(sess, observations, a, y))


# Copy model parameters from one estimator to another
class ModelParametersCopier():
    def __init__(self, estimator1, estimator2):
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
        e1_params = sorted(e1_params, key=lambda x: x.name)
        e2_params = sorted(e2_params, key=lambda x: x.name)
        self.update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            self.update_ops.append(op)

    def make(self, sess):
        """Perform the copy."""
        sess.run(self.update_ops)


def make_epsilon_greedy_policy(estimator, nA):
    """
    Build an epsilon-greedy policy from the given Q estimator.
    Returns: a function mapping (sess, observation, epsilon) to a probability distribution over actions.
    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn


def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=50000,
                    replay_memory_init_size=5000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-learning with off-policy TD control and a deep Q-network.
    Yields: (total_t, EpisodeStats) after every episode.
    """
    # A single transition
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    # Replay memory
    replay_memory = []
    # Copies the Q-network parameters into the target network
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)
    # Track useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Optionally monitor the health of the current process (requires psutil):
    # import psutil
    # current_process = psutil.Process()

    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("load model checkpoint {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())
    # Epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    # Epsilon-greedy behavior policy
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)  # a state is a stack of 4 frames
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)  # preprocess
        next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Training loop
    for i_episode in range(num_episodes):
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)

        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]
            # Periodically copy the Q-network parameters into the target network
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)

            # Take one epsilon-greedy step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)

            # Keep the replay memory bounded
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Compute the Q values and TD targets from the target network
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * np.amax(q_values_next, axis=1)

            # Gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break
            state = next_state
            total_t += 1

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    return stats


global_step = tf.Variable(0, name='global_step', trainable=False)
q_estimator = Estimator(scope="q_estimator")
target_estimator = Estimator(scope="target_q")
state_processor = StateProcessor()
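The targets_batch line implements the usual DQN target computed from the separate target network, with the bootstrap term switched off on terminal transitions (np.invert(done_batch) plays the role of 1 - d):

$$y_i = r_i + (1 - d_i)\,\gamma \max_{a'} Q_{\text{target}}(s'_i, a')$$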
REINFORCE
""" @Date :2022/11/2 @Author :d """ import gym import collections import numpy as np import itertools import tensorflow.compat.v1 as tf tf.disable_v2_behavior() from lib import plotting env = gym.make("CliffWalking-v0") env.reset() env.render() print(env.step(0)) env.render() print(env.step(1)) env.render() class PolicyEstimator(): """:策略函数估计器 """ def __init__(self, learning_rate=0.01, scope="policy_estimator"): with tf.variable_scope(scope): self.state = tf.placeholder(tf.int32, [], "state") self.action = tf.placeholder(dtype=tf.int32, name="action") self.target = tf.placeholder(dtype=tf.float32, name="target") # 查表估计 state_one_hot = tf.one_hot(self.state, int(env.observation_space.n)) self.output_layer = tf.layers.dense(inputs=tf.expand_dims(state_one_hot, 0), units=env.action_space.n, kernel_initializer=tf.zeros_initializer, activation=None) self.action_probs = tf.squeeze(tf.nn.softmax(self.output_layer)) # 变成一维向量 self.picked_action_prob = tf.gather(self.action_probs, self.action) self.loss = -tf.math.log(self.picked_action_prob) * self.target self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) self.train_op = self.optimizer.minimize( self.loss, global_step=tf.train.get_global_step() ) def predict(self, state, sess=None): # 预测只接受状态 sess = sess or tf.get_default_session() return sess.run(self.action_probs, {self.state: state}) def update(self, state, target, action, sess=None): # 更新需要状态、动作、目标 sess = sess or tf.get_default_session() feed_dict = {self.state: state, self.action: action, self.target: target, } _, loss = sess.run([self.train_op, self.loss], feed_dict) return loss p = PolicyEstimator() class ValueEstimator(): """ 价值评估器,add baseline estimator """ def __init__(self, learning_rate=0.1, scope="value_estimator"): with tf.variable_scope(scope): self.state = tf.placeholder(tf.int32, [], "state") self.target = tf.placeholder(dtype=tf.float32, name="target") # 将状态值映射到one-hot state_one_hot = tf.one_hot(self.state, int(env.observation_space.n)) self.output_layer = tf.layers.dense( inputs=tf.expand_dims(state_one_hot, 0), units=1, kernel_initializer=tf.zeros_initializer, activation=None ) self.value_estimate = tf.squeeze(self.output_layer) self.loss = tf.squared_difference(self.value_estimate, self.target) self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) self.train_op = self.optimizer.minimize( self.loss, global_step=tf.train.get_global_step() ) def predict(self, state, sess=None): sess = sess or tf.get_default_session() return sess.run(self.value_estimate, {self.state: state}) def update(self, state, target, sess=None): sess = sess or tf.get_default_session() feed_dict = {self.state: state, self.target: target} _, loss = sess.run([self.train_op, self.loss], feed_dict) return loss def reinforce(env, estimator_policy, estimator_value, num_episode, discount_factor=1.0): """ 蒙特卡洛策略梯度算法-使用策略梯度优化策略估计器 """ stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episode), episode_rewards=np.zeros(num_episode)) Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"]) for i_episode in range(num_episode): print(f"{i_episode}:episode") state = env.reset() if i_episode % 100 == 0: print(state) episode = [] # t个时间序列 for t in itertools.count(): # 执行一步 action_probs = estimator_policy.predict(state) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(action) # 状态信息记录 episode.append(Transition(state=state, action=action, reward=reward, next_state=next_state, 
done=done)) stats.episode_lengths[i_episode] = t stats.episode_rewards[i_episode] += reward if done: break state = next_state # 浏览当前回合并进行策略更新 for t, transition in enumerate(episode): # 当前时间步的回报 total_return = sum(discount_factor**i*e.reward for i,e in enumerate(episode[t:])) # 计算 baseline/advantage baseline_value = estimator_value.predict(transition.state) advantage = total_return - baseline_value estimator_value.update(transition.state, total_return) estimator_policy.update(transition.state, advantage, transition.action) return stats tf.reset_default_graph() global_step = tf.Variable(0, name="global_step", trainable=False) policy_estimator = PolicyEstimator() value_estimator = ValueEstimator()
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = reinforce(env, policy_estimator, value_estimator, 10, discount_factor=1.0)

plotting.plot_episode_stats(stats, smoothing_window=25)
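The update loop in reinforce follows the REINFORCE-with-baseline gradient estimate, which is why PolicyEstimator minimizes -log(picked_action_prob) * target with the advantage as the target:

$$\nabla_\theta J(\theta) \approx \sum_t \big(G_t - b(s_t)\big)\,\nabla_\theta \log \pi_\theta(a_t \mid s_t), \qquad G_t = \sum_{k \ge 0} \gamma^{k} r_{t+k+1}$$

The baseline b(s_t) is the ValueEstimator prediction, trained by regression towards G_t.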
Actor-Critic Algorithm
def actor_critic(env, estimator_policy, estimator_value, num_episode, discount_factor=1.0):
    """
    Actor-critic: the actor is the policy network and the critic is the value network.
    The TD error is the optimization target for the policy (actor); the TD target is the
    optimization target for the value function (critic).
    """
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episode),
                                  episode_rewards=np.zeros(num_episode))
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    for i_episode in range(num_episode):
        state = env.reset()
        if i_episode % 100 == 0:
            print(f"{i_episode}:episode")
            print(state)
        episode = []  # the episode's sequence of transitions

        for t in itertools.count():
            # Take one step
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # Record the transition
            episode.append(Transition(state=state, action=action, reward=reward,
                                      next_state=next_state, done=done))
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward

            # Build the TD error
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)

            # Update the critic with the TD target
            estimator_value.update(state, td_target)
            # Update the actor with the TD error as the advantage estimate
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state

    return stats
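The quantity used to update both networks is the one-step TD error

$$\delta_t = r_{t+1} + \gamma V(s_{t+1}) - V(s_t)$$

The critic regresses V(s_t) towards the TD target r_{t+1} + gamma * V(s_{t+1}), and the actor is pushed in the direction delta_t * grad log pi(a_t | s_t).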
Actor-Critic for Continuous Action Spaces
import gym
import collections
import numpy as np
import itertools
import sklearn
from sklearn import preprocessing, pipeline
from sklearn.kernel_approximation import RBFSampler
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from lib import plotting

env = gym.make("MountainCarContinuous-v0")
print(env.observation_space.sample())

# Sample some states from the observation space and fit a standardizing scaler
observation_examples = np.array([env.observation_space.sample() for _ in range(1000)])
scaler = preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Convert states to features
featurizer = pipeline.FeatureUnion([
    ('rbf1', RBFSampler(gamma=5.0, n_components=100)),
    ('rbf2', RBFSampler(gamma=2.0, n_components=100)),
    ('rbf3', RBFSampler(gamma=1.0, n_components=100)),
    ('rbf4', RBFSampler(gamma=0.5, n_components=100)),
])
featurizer.fit(scaler.transform(observation_examples))


def featurize_state(state):
    """
    Return the feature representation of a state: a 400-dimensional vector.
    """
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized[0]

# featurize_state(env.observation_space.sample())


class PolicyEstimator():
    """Policy estimator: a Gaussian policy with learned mean and standard deviation."""
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            self.mu = tf.layers.dense(inputs=tf.expand_dims(self.state, 0), units=1)
            self.mu = tf.squeeze(self.mu)

            self.sigma = tf.layers.dense(inputs=tf.expand_dims(self.state, 0), units=1)
            self.sigma = tf.squeeze(self.sigma)
            self.sigma = tf.nn.softplus(self.sigma) + 1e-5

            self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
            self.action = self.normal_dist._sample_n(1)
            # Clip the sampled action to the valid action range
            self.action = tf.clip_by_value(self.action,
                                           env.action_space.low[0],
                                           env.action_space.high[0])

            # Loss and train op (the entropy bonus encourages exploration)
            self.loss = -self.normal_dist.log_prob(self.action) * self.target
            self.loss -= 1e-1 * self.normal_dist.entropy()

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step())

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.action, {self.state: state})

    def update(self, state, target, action, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = {self.state: state, self.target: target, self.action: action}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss


class ValueEstimator():
    """Value function estimator."""
    def __init__(self, learning_rate=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            self.output_layer = tf.layers.dense(
                inputs=tf.expand_dims(self.state, 0), units=1)
            self.value_estimator = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimator, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step())

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.value_estimator, {self.state: state})

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = {self.state: state, self.target: target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss


def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    Actor-critic for a continuous action space.
    """
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    for i_episode in range(num_episodes):
        if i_episode % 10 == 0:
            print(i_episode)
        state = env.reset()
        episode = []

        for t in itertools.count():
            action = estimator_policy.predict(state)
            next_state, reward, done, _ = env.step(action)
            episode.append(Transition(
                state=state, action=action, reward=reward, next_state=next_state, done=done))

            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TD target and TD error
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)

            # Update the critic and the actor
            estimator_value.update(state, td_target)
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state

    return stats


tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator(learning_rate=0.001)
value_estimator = ValueEstimator(learning_rate=0.1)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)
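For the continuous action, PolicyEstimator parameterizes a Gaussian policy; the sampled action and the loss it minimizes (with the TD error passed in as target and an entropy bonus for exploration) are

$$a \sim \mathcal{N}\big(\mu_\theta(s), \sigma_\theta(s)\big), \qquad L(\theta) = -\log \pi_\theta(a \mid s)\,\delta \;-\; 0.1\,H\big(\pi_\theta(\cdot \mid s)\big)$$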
Always remember the kind of person you want to become!