Reinforcement Learning - Notes

import gym
from gym import envs
env_specs = envs.registry.all()   # list all environments registered with gym
# for e in env_specs:
#     print(e)

env = gym.make("CartPole-v1")  # 取出环境平衡车-v1
env.reset()                    # 初始化环境对象env, 返回智能体的初始观测值:array([-0.00667078, -0.04023064, -0.01557324,  0.04904377])
# env.observation_space          # 环境的观察空间:Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
# env.action_space               # 环境的动作空间:Discrete(2)
# 环境初始化后即可使用,核心:使用对象的step()方法
action = env.action_space.sample()  # step参数需要取自动作空间,可以使用sample从动作空间随机取一个动作;每次调用会使得环境前进一步,需要放在循环里完成整个回合。
# env.render()                   # 显示当前环境
# env.close()                    # 关闭当前环境

# To do better than picking a random action at every step, the agent has to know what each action did to the environment. The step() method returns four values:
# observation (object): an environment-specific object representing the agent's observation of the environment; same meaning as the value returned by env.reset()
# reward (float): the amount of reward obtained from the previous action
# done (boolean): whether the environment needs to be reset again, i.e. the episode has ended
# info (dict): diagnostic information useful for debugging
env.observation_space.high
import gym
env = gym.make("CartPole-v1")
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(observation, reward, done, info)
        if done:
            print(f"Episode finished after {t+1} timesteps")
            break
env.close()
import gym
env = gym.make("MountainCar-v0")
print(f"观测空间={env.observation_space}")
print(f"动作空间={env.action_space}")
print(f"观测范围={env.observation_space.low}~{env.observation_space.high}")
print(f"动作数={env.action_space.n}")
class BespokeAgent:
    def __init__(self, env):
        pass
    # Decision rule
    def decide(self, observation):
        position, velocity = observation
        lb = min(-0.09 * (position + 0.25) ** 2 + 0.03, 0.3*(position + 0.9) **4 - 0.008)
        ub = -0.07 * (position + 0.38) **2 + 0.06
        if lb < velocity < ub:
            action = 2
        else:
            action = 0
        return action
    # Learning (a no-op for this hand-crafted agent)
    def learn(self, *args):
        pass
agent = BespokeAgent(env)
def play_montecarlo(env, agent, render=True, train=False):
    episode_reward = 0.
    observation = env.reset()
    while True:
        if render:
            env.render()    # render the graphical interface
        action = agent.decide(observation)     # the agent decides which action to take
        next_observation, reward, done, _ = env.step(action)     # execute the action
        episode_reward += reward            # accumulate the episode reward
        if train:
            agent.learn(observation, action, reward, done) # learn
        if done:
            break
        observation = next_observation
    return episode_reward      # return the total episode reward
# env.seed(0)
# Run a single episode
episode_reward = play_montecarlo(env, agent, train=True)
print(f"Episode reward: {episode_reward}")
env.close()
# Interact for several consecutive episodes (10 here) and report the average reward
import numpy as np
episode_rewards = [play_montecarlo(env, agent, train=True) for _ in range(10)]
print(f"Average episode reward = {np.mean(episode_rewards)}")
env.close()

 

Discrete Markov decision processes (CliffWalking)

import gym
env = gym.make("CliffWalking-v0")
print(f"观测空间={env.observation_space}")
print(f"动作空间={env.action_space}")
print(f"状态数量={env.nS} 动作数量={env.nA}")
print(f"地图大小={env.shape}")

import numpy as np
def play_once(env, policy):
    total_reward = 0
    state = env.reset()
    loc = np.unravel_index(state, env.shape)
    print(f"state={state}, position={loc}")
    while True:
        loc = np.unravel_index(state, env.shape)             # keep the printed position in sync with the current state
        action = np.random.choice(env.nA, p=policy[state])   # sample an action index in [0, nA) according to the policy's distribution for this state
        # execute step(); the environment returns its feedback
        next_state, reward, done, _ = env.step(action)
        print(f"state: {state}  position: {loc}  reward: {reward}")
        total_reward += reward
        if done:
            break
        state = next_state
    return total_reward

actions = np.ones(env.shape, dtype=int)
actions[-1, :] = 0
actions[:, -1] = 2
optimal_policy = np.eye(4)[actions.reshape(-1)]    # one-hot encode the action grid into a deterministic policy
total_rewards = play_once(env, optimal_policy)
print(f"Total reward: {total_rewards}")

def evaluate_bellman(env, policy, gamma=1.):
    a, b = np.eye(env.nS), np.zeros((env.nS))     # coefficient matrix and constant vector of the linear system
    for state in range(env.nS-1):
        for action in range(env.nA):
            pi = policy[state][action]
            # print(f"{state} {action}: pi: {pi}")    # probability of each action in each state; env.P stores the environment dynamics
            for p, next_state, reward, done in env.P[state][action]:
                a[state, next_state] -= (pi * gamma * p)   # include the transition probability p (p = 1 here, since CliffWalking is deterministic)
                b[state] += (pi * reward * p)
    # solve the linear system to obtain the state values
    v = np.linalg.solve(a, b)
    print(v)
    q = np.zeros((env.nS, env.nA))
    # use the state values to compute the action values
    for state in range(env.nS-1):
        for action in range(env.nA):
            for p, next_state, reward, done in env.P[state][action]:
                q[state][action] += ((reward + gamma*v[next_state])*p)
    return v, q
policy = np.random.uniform(size=(env.nS, env.nA))
policy = policy / np.sum(policy, axis=1)[:, np.newaxis]   # normalize each row into a probability distribution
state_values, action_values = evaluate_bellman(env, policy)
print(f"State values = {state_values}\nAction values = {action_values}")

# Evaluate the optimal policy
optimal_state_values, optimal_action_values = evaluate_bellman(env, optimal_policy)
print(f"Optimal state values = {optimal_state_values}\nOptimal action values = {optimal_action_values}")

###########################################################################

GridWorld Experiment

 

1. Policy evaluation

# Policy evaluation
def policy_value(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Func: evaluate a policy, given the environment's dynamics and the policy
    policy: array of shape |S| x |A| representing the agent's policy
    env: env.P stores the transition tuples (prob, next_state, reward, done)
         env.nS is the number of states
         env.nA is the number of actions
    discount_factor: discount factor for the cumulative return
    theta: stopping threshold; evaluation stops once every state's value changes by less than this amount
    """
    V = np.zeros(env.nS)  # initialize all values to 0
    while True:
        delta = 0
        # sweep over every state
        for s in range(env.nS):
            v = 0
            # look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # for each action, look at the possible next states
                for prob, next_state, reward, done in env.P[s][a]:
                    # accumulate the expected value
                    v += action_prob * prob * (reward + discount_factor*V[next_state])   # bootstrapping: the next state's value is taken from the current estimate
            delta = max(delta, np.abs(v-V[s]))
            # update V[s]
            V[s] = v
        if delta < theta:
            break
    return np.array(V)

random_policy = np.ones([env.nS, env.nA]) / env.nA
v = policy_value(random_policy, env)
print(f"Value function: {v.reshape(env.shape)}")

 2. Policy iteration and value iteration

Both policy iteration and value iteration can solve the Markov decision (control) problem. Policy iteration alternates two steps: 1) policy evaluation; 2) policy improvement. Value iteration instead applies the Bellman optimality equation directly to find the optimal value function, from which the optimal policy is read off. A minimal policy-iteration sketch follows; the code after it implements value iteration.
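The following is a minimal sketch of that policy-iteration loop, reusing the policy_value function defined above (the helper name policy_iteration and its exact structure are my addition, not part of the original notes):

def policy_iteration(env, discount_factor=1.0):
    # start from the uniform random policy
    policy = np.ones([env.nS, env.nA]) / env.nA
    while True:
        # 1) policy evaluation: estimate V for the current policy
        V = policy_value(policy, env, discount_factor)
        policy_stable = True
        # 2) policy improvement: act greedily with respect to a one-step lookahead
        for s in range(env.nS):
            old_action = np.argmax(policy[s])
            q = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    q[a] += prob * (reward + discount_factor * V[next_state])
            best_action = np.argmax(q)
            if best_action != old_action:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_action]
        # stop once the greedy policy no longer changes
        if policy_stable:
            return policy, V
# example usage: pi_star, V_star = policy_iteration(env)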

# Value iteration
def value_iteration(env, discount_factor=1.0, theta=0.00001):
    """ Value iteration
    """
    def one_step_lookahead(state, V):   # one-step lookahead
        """
        Given a state, compute the value of every action available from it
        Args:
            state: the state to consider
            V: the current value estimate used to evaluate successor states
        Returns:
            a vector of length nA containing the expected value of each action
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                # the action is already fixed here, so no action probability is needed
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
    V = np.zeros(env.nS)
    while True:
        # stopping criterion
        delta = 0
        # update every state
        for s in range(env.nS):
            # get the value of each action in this state
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)    # take the maximum value, not the action itself
            delta = max(delta, np.abs(best_action_value - V[s]))
            # update the value function
            V[s] = best_action_value
        if delta < theta:
            break
    # build a deterministic policy from the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # one-step lookahead to find the best action in this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # always take the best action
        policy[s, best_action] = 1.0
    return policy, V

 

 3. Gambler's problem

def value_iteration_gambers(p_h, discount_factor=1.0, theta=0.0001):
    """ p_h: probability of the coin coming up heads
    """
    # the reward is 1 only when the goal of 100 is reached, 0 otherwise
    rewards = np.zeros(101)
    rewards[100] = 1
    # include the two terminal states 0 and 100
    V = np.zeros(101)
    def one_step_lookahead(s, V, rewards):
        """
            s: the gambler's capital
            V: the value of each state
            rewards: the reward vector
        """
        A = np.zeros(101)
        stakes = range(1, min(s, 100-s)+1)     # possible stakes: at least 1, at most min(s, 100-s)
        for a in stakes:
            # rewards[s+a], rewards[s-a] are the immediate rewards
            # V[s+a], V[s-a] are the values of the next states
            # expected value of each stake: the win branch plus the loss branch
            A[a] = p_h * (rewards[s+a] + V[s+a]*discount_factor) + (1-p_h) * (rewards[s-a] + V[s-a]*discount_factor)
        return A
    
    while True:
        delta = 0
        for s in range(1, 100):
            A = one_step_lookahead(s, V, rewards)
            # print(A, s, V); pick the action with the largest value
            best_action_value = np.max(A)
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break
    # build a deterministic policy from the optimal value function
    policy = np.zeros(100)
    for s in range(100):
        A = one_step_lookahead(s, V, rewards)
        best_action = np.argmax(A)
        policy[s] = best_action
    return policy, V
policy, v = value_iteration_gambers(0.25)
print(f"Optimized Policy:{policy}")
print(f"Optimized Value Function:{v}")

 Monte Carlo (MC) methods

import gym
import numpy as np
from collections import defaultdict
env = gym.make("Blackjack-v1")
env.reset()

def print_observation(observation):
    score, dealer_score, usable_ace = observation
    print(f"Player Score: {score}, Usable Ace: {usable_ace}, Dealer Score: {dealer_score}")

def strategy(observation):
    score, dealer_score, usable_ace = observation
    # simple strategy: stick (action 0) if the score is 20 or more, otherwise hit (action 1)
    return 0 if score >= 20 else 1

for i_episode in range(2):
    observation = env.reset()
    for t in range(2):
        print_observation(observation)
        action = strategy(observation)
        print("采取的动作:{}".format(["不要",""][action]))
        # 走一步,得到环境反馈
        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward:{}\n".format(float(reward)))

def mc_prediction(policy, env, num_episodes, discount_factor=1.0):
    """
        First-visit Monte Carlo prediction: sample episodes under the given policy and estimate the value function.
        num_episodes: number of episodes to sample
        Returns: a dict mapping state -> value
    """
    return_sum = defaultdict(float)
    return_count = defaultdict(float)
    V = defaultdict(float)  # value function
    for i_episode in range(1, num_episodes+1):
        # generate one episode of (state, action, reward) tuples
        episode = []
        state = env.reset()
        for i in range(100):
            action =  policy(state)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        # collect the set of states visited in this episode
        states_in_episode = set([tuple(e[0]) for e in episode])
        # print(f"episode:{episode}\nstates_in_episode:{states_in_episode}\n")
        for state in states_in_episode:
            # index of the first occurrence of this state in the episode
            first_occurence_idx = next(i for i,x in enumerate(episode) if x[0]==state)
            # sum the discounted rewards from the first occurrence onwards
            G = sum([x[2]*(discount_factor**i) for i,x in enumerate(episode[first_occurence_idx:])])
            # estimate the state value as the average return over all sampled episodes
            return_sum[state] += G
            return_count[state] += 1
            V[state] = return_sum[state] / return_count[state]
    return V
        
def sample_policy(observation):
    # return 0 (stick) or 1 (hit)
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1

V_10k = mc_prediction(sample_policy, env, int(10e4))
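As a quick sanity check (my addition, not in the original notes), a few of the estimated state values can be printed; the course's lib/plotting helper also provides a 3-D surface plot if it is available:

for state in list(V_10k.keys())[:5]:
    print(state, V_10k[state])                # (player score, dealer card, usable ace) -> estimated value
# from lib import plotting
# plotting.plot_value_function(V_10k, title="100,000 episodes")   # assumes the helper from the original course repo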

MC control with epsilon-greedy policies

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
        Create an epsilon-greedy policy from a given Q function
        Q: state-action values       epsilon: probability of choosing a random action     nA: number of actions in the environment
    """
    def policy_fn(observation):
        # observation: environment state (player score, dealer score, usable ace)
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)        # the greedy action gets the remaining probability mass
        return A
    return policy_fn

def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
        Monte Carlo control with an epsilon-greedy policy: find the optimal epsilon-greedy policy
    """
    return_sum = defaultdict(float)
    return_count = defaultdict(float)
    # action-value function Q: state -> (action -> action-value)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))    # initialization
    # build the policy
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
    for i_episode in range(1, num_episodes+1):
        # generate one episode
        episode = []
        state = env.reset()
        for i in range(100):
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)      # sample an action from the policy's distribution
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        # collect the (state, action) pairs visited in this episode
        sa_in_episode = set([(tuple(e[0]), e[1]) for e in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            first_occurance_idx = next(i for i,x in enumerate(episode) if x[0]==state and x[1]==action)
            # discounted return from the first occurrence onwards
            G = sum([e[2] * (discount_factor**i) for i,e in enumerate(episode[first_occurance_idx:])])
            # average the sampled returns
            return_sum[sa_pair] += G
            return_count[sa_pair] += 1
            # action value Q
            Q[state][action] = return_sum[sa_pair] / return_count[sa_pair]
        # updating Q also updates the policy, since policy_fn reads Q by reference
    return Q, policy
Q, policy = mc_control_epsilon_greedy(env, 5000)

Off-policy MC control with weighted importance sampling
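For reference, the incremental weighted importance-sampling update implemented below is the standard one (Sutton & Barto, Ch. 5):

C(s_t, a_t) \leftarrow C(s_t, a_t) + W, \qquad
Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \frac{W}{C(s_t, a_t)}\,\big[ G_t - Q(s_t, a_t) \big], \qquad
W \leftarrow W \cdot \frac{\pi(a_t \mid s_t)}{b(a_t \mid s_t)}

Because the target policy π is greedy with respect to Q, π(a|s) is 1 for the greedy action and 0 otherwise, so the loop below breaks as soon as the behavior action differs from the greedy one and otherwise multiplies W by 1/b(a|s).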

def create_random_policy(nA):
    """ 随机策略函数
        返回:动作概率向量
    """
    A = np.ones(nA, dtype=float) / nA
    def policy_fn(observation):
        return A
    return policy_fn

def create_greedy_policy(Q):
    """ 基于Q值创建贪婪策略
    """
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        best_action = np.argmax(Q[state])
        A[best_action] = 1.0
        return A
    return policy_fn

def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):
    """ Off-policy MC control with weighted importance sampling
    """
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # cumulative denominator of the weighted importance-sampling estimator
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    # the target policy we want to learn: greedy with respect to Q
    target_policy = create_greedy_policy(Q)
    for i_episode in range(1, num_episodes+1):
        # generate one episode
        episode = []
        state = env.reset()
        for i in range(100):
            probs = behavior_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        G, W = 0.0, 1.0   # W: importance-sampling weight
        # walk through the episode backwards
        for t in range(len(episode))[::-1]:
            state, action, reward = episode[t]
            # update the return from time t
            G = discount_factor * G + reward
            # update the cumulative weight
            C[state][action] += W
            # update the action value
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            # if the behavior action is not the target (greedy) action, stop early
            if action != np.argmax(target_policy(state)):
                break
            W = W * 1.0/behavior_policy(state)[action]
    return Q, target_policy
        
random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, 1000, random_policy)

SARSA

from lib.envs.gridworld import GridworldEnv
from lib import plotting
import numpy as np
import itertools
from collections import defaultdict

env = GridworldEnv()
print(f"状态数:{env.nS}, 动作数:{env.nA}, 动力系统:状态0的转移信息:{env.P[0]}")

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """ Create an epsilon-greedy policy from the given Q and epsilon
    """
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])    # the greedy action for the current state according to Q
        A[best_action] += (1 - epsilon)            # keep A a valid probability distribution
        return A
    return policy_fn

def sarsa(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """ SARSA: on-policy TD control. Finds the optimal epsilon-greedy policy.
        On-policy: the same policy is used both to select actions and to be improved.
    """
    Q = defaultdict(lambda :np.zeros(env.action_space.n))   # initial Q
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))  # statistics tracking
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)     # the policy being followed
    for i_episode in range(num_episodes):
        state = env.reset()
        action_probs = policy(state)
        # print(action_probs)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)   # sample from the distribution
        for t in itertools.count():      # iterate until the episode terminates
            # take one step
            next_state, reward, done, _ = env.step(action)
            # choose the next action epsilon-greedily from the same policy
            next_action_probs = policy(next_state)
            next_action = np.random.choice(np.arange(len(next_action_probs)), p=next_action_probs)
            # track statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] += 1
            # TD update
            td_target = reward + discount_factor * Q[next_state][next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta
            if done:
                break
            action = next_action
            state = next_state
    return Q, stats
Q, stats = sarsa(env, 200)
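The TD update in the loop above is the standard SARSA rule, using the action actually taken in the next state:

Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\,\big[ r_{t+1} + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \big]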

Q-learning

def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """ Q-learning: off-policy TD control. Finds the optimal policy while following an epsilon-greedy behavior policy.
        Off-policy: the behavior policy explores epsilon-greedily, while the target uses the maximum-value action in the next state (the next action actually taken does not need to be known).
    """
    Q = defaultdict(lambda :np.zeros(env.action_space.n))   # initial Q
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes), episode_rewards=np.zeros(num_episodes))  # statistics tracking
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)     # the behavior policy
    for i_episode in range(num_episodes):
        state = env.reset()
        for t in itertools.count():      # iterate until the episode terminates
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)   # exploratory action
            # take one step
            next_state, reward, done, _ = env.step(action)
            # track statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] += 1
            # TD update
            best_next_action = np.argmax(Q[next_state])   # best action in the next state
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta
            if done:
                break
            state = next_state
    return Q, stats
Q, stats = q_learning(env, 200)
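Q-learning replaces the sampled next action with a maximization over actions, which is exactly what makes it off-policy:

Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\,\big[ r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \big]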

Deep Q-learning

import gym
import os
import random
import itertools
import psutil                               # used below to monitor the training process
import numpy as np
import tensorflow as tf
from collections import namedtuple
from lib import plotting

env = gym.make("Breakout-v0")
VALID_ACTIONS = [0, 1, 2, 3]   # available actions: 0: no-op, 1: fire, 2: left, 3: right

class StateProcessor():
    """ Frame pre-processing: convert to grayscale, crop and resize
    """
    def __init__(self):
        # build the TF graph
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)    # single channel
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)     # crop to 160x160
            self.output = tf.image.resize_images(
                self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)
            print(self.output)
    def process(self, sess, state):
        # (210, 160, 3) -> (84, 84)
        return sess.run(self.output, {self.input_state: state})
sp = StateProcessor()

tf.reset_default_graph()
class Estimator():
    """Q-价值估计网络。用于Q-Network 和 Target Network
    """
    def __init__(self, scope="estimator", summaries_dir=None):
        self.scope = scope
        self.summary_writer = None
        with tf.variable_scope(scope):
            self._build_model()
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)
                
    def _build_model(self):
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name='X')
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name='y')    # TD target value
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name='actions')   # the chosen actions
        
        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]
        
        # convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))
        print("predictions:", self.predictions)
        # four stacked frames form one state; pick out the predicted Q value of the chosen action for each sample
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        print("gather_indices:", gather_indices)
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
        print("action_predictions", self.action_predictions)
        
        
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])
        
    def predict(self, sess, s):
        """ Predict the action values for a batch of states
        """
        return sess.run(self.predictions, {self.X_pl: s})

    def update(self, sess, s, a, y):
        """ Update the model and return the batch loss
            s: input states, shape [batch_size, 84, 84, 4]
            a: chosen actions
            y: target values (TD targets)
        """
        feed_dict = {self.X_pl: s, self.y_pl: y, self.actions_pl: a}
        summaries, global_step, _, loss = sess.run(
        [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
        feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss
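The flatten-and-gather indexing in _build_model can look cryptic; here is a small standalone NumPy illustration of the same trick (my example, with made-up numbers):

import numpy as np
predictions = np.array([[0.1, 0.2, 0.3, 0.4],     # Q values for sample 0
                        [1.0, 2.0, 3.0, 4.0],     # Q values for sample 1
                        [5.0, 6.0, 7.0, 8.0]])    # Q values for sample 2
actions = np.array([1, 3, 2])                     # action chosen for each sample
batch_size, n_actions = predictions.shape
gather_indices = np.arange(batch_size) * n_actions + actions
print(predictions.reshape(-1)[gather_indices])    # [0.2 4.  7. ], i.e. Q(s_i, a_i) for each sample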
    

# Test
tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)

e = Estimator(scope='test')
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    observation = env.reset()
    print("observation:", np.shape(observation))
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 3)
    
    print(e.predict(sess, observations))
    
    y = np.array([10.0, 10.0, 11.0])
    a = np.array([1, 3, 2])
    print(e.update(sess, observations, a, y))


# Copy model parameters from one estimator to the other (Q-network -> target network)
class ModelParametersCopier():
    def __init__(self, estimator1, estimator2):
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
        e2_params = sorted(e2_params, key=lambda x: x.name)
        e1_params = sorted(e1_params, key=lambda x: x.name)
        
        self.update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            self.update_ops.append(op)
            
    def make(self, sess):
        """ 执行copy
        """
        sess.run(self.update_ops)
        
def make_epsilon_greedy_policy(estimator, nA):
    """ Build an epsilon-greedy policy from a Q-function estimator
        Returns: a function mapping (sess, observation, epsilon) to a probability distribution over the actions
    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def deep_q_learning(sess,
                   env,
                   q_estimator,
                   target_estimator,
                   num_episodes,
                   experiment_dir,
                   replay_memory_size=50000,
                   replay_memory_init_size=5000,
                   update_target_estimator_every=10000,
                   discount_factor=0.99,
                   epsilon_start=1.0,
                   epsilon_end=0.1,
                   epsilon_decay_steps=500000,
                   batch_size=32,
                   record_video_every=50):
    """ 异策略TD控制的Q学习
        返回:EpisodeStats对象
    """
    # 状态转移
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    # 回放内存
    replay_memory = []
    # 复制模型
    estimator_copy = ModelParameterCopier(q_estimator, target_estimator)
    # 追踪有用的状态
    stats = plotting.EpisodeStats(epsiode_lengths=np.zeros(num_epsiodes),
                                 epsiode_rewards=np.zeros(num_epsidoes))
    # 便于查看健康状态
    current_process = %psutil.Process()
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)
    saver = tf.train.Saver()
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("load model checkpoint {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    # 得到当前时间步长
    total_t = sess.run(tf.contrib.framework.get_global_step())
    # epsilon 衰减时间表
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    # 贪婪策略
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))
    # fill the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state]*4, axis=2)   # a state is a stack of the 4 most recent frames
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)   # pre-process the frame
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state]*4, axis=2)
        else:
            state = next_state

    # main training loop: one iteration per episode
    for i_episode in range(num_episodes):
        saver.save(tf.get_default_session(), checkpoint_path)   # save a checkpoint every episode
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state]*4, axis=2)
        for t in itertools.count():
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]
            # periodically copy the Q-network parameters into the target network
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
            # take one epsilon-greedy step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
            # if the replay memory is full, drop the oldest transition
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)
            replay_memory.append(Transition(state, action, reward, next_state, done))
            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t
            # sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))
            # compute the TD targets with the target network
            q_values_next = target_estimator.predict(sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)
            # gradient descent step on the Q-network
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break
            state = next_state
            total_t += 1

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    return stats
global_step = tf.Variable(0, name='global_step', trainable=False)
q_estimator = Estimator(scope="q_estimator")
target_estimator = Estimator(scope="target_q")
state_processor = StateProcessor()
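The notes stop after constructing the estimators. A minimal sketch of how the generator above could be driven looks like the following (my addition; the experiment directory, episode count and reduced replay_memory_init_size are placeholder values for a smoke test, not the settings used to actually train Breakout):

experiment_dir = os.path.abspath("./experiments/Breakout-v0")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for total_t, episode_stats in deep_q_learning(sess, env, q_estimator, target_estimator,
                                                  num_episodes=10, experiment_dir=experiment_dir,
                                                  replay_memory_init_size=500):
        print("Episode reward:", episode_stats.episode_rewards[-1])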

REINFORCE
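As a reminder of what the code below optimizes, REINFORCE with a baseline uses the Monte Carlo return G_t and a learned state-value baseline, and follows the policy gradient

\nabla_\theta J(\theta) \approx \sum_t \big( G_t - \hat{v}(s_t) \big)\, \nabla_\theta \log \pi_\theta(a_t \mid s_t)

which is why PolicyEstimator.loss below is -log π(a|s) multiplied by the advantage target.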

"""
@Date   :2022/11/2
@Author :d
"""
import gym
import collections
import numpy as np
import itertools
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from lib import plotting

env = gym.make("CliffWalking-v0")
env.reset()
env.render()
print(env.step(0))
env.render()
print(env.step(1))
env.render()

class PolicyEstimator():
    """:策略函数估计器
    """
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # table lookup: one-hot encode the state and feed it to a linear layer
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.layers.dense(inputs=tf.expand_dims(state_one_hot, 0),
                                                units=env.action_space.n, 
                                                kernel_initializer=tf.zeros_initializer,
                                               activation=None)
            self.action_probs = tf.squeeze(tf.nn.softmax(self.output_layer))    # squeeze to a 1-D probability vector
            self.picked_action_prob = tf.gather(self.action_probs, self.action)
            self.loss = -tf.math.log(self.picked_action_prob) * self.target

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step()
            )

    def predict(self, state, sess=None):
        # prediction only needs the state
        sess = sess or tf.get_default_session()
        return sess.run(self.action_probs, {self.state: state})

    def update(self, state, target, action, sess=None):
        # the update needs the state, the action and the target
        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state,
                     self.action: action,
                     self.target: target,
                     }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

p = PolicyEstimator()
class ValueEstimator():
    """ 价值评估器,add baseline estimator
    """
    def __init__(self, learning_rate=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # map the state index to a one-hot vector
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.layers.dense(
                inputs=tf.expand_dims(state_one_hot, 0),
                units=1,
                kernel_initializer=tf.zeros_initializer,
                activation=None
            )
            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step()
            )

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        return sess.run(self.value_estimate, {self.state: state})

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        feed_dict = {self.state: state, self.target: target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

def reinforce(env, estimator_policy, estimator_value, num_episode, discount_factor=1.0):
    """ 蒙特卡洛策略梯度算法-使用策略梯度优化策略估计器
    """
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episode),
                                  episode_rewards=np.zeros(num_episode))
    Transition = collections.namedtuple("Transition",
                                        ["state", "action", "reward", "next_state", "done"])
    for i_episode in range(num_episode):
        print(f"{i_episode}:episode")
        state = env.reset()
        if i_episode % 100 == 0:
            print(state)
        episode = []
        # roll out one episode step by step
        for t in itertools.count():
            # take one step
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # record the transition and statistics
            episode.append(Transition(state=state, action=action, reward=reward, next_state=next_state, done=done))
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward

            if done:
                break
            state = next_state
        # walk through the episode and update the estimators
        for t, transition in enumerate(episode):
            # return from the current time step onwards
            total_return = sum(discount_factor**i*e.reward for i,e in enumerate(episode[t:]))
            # compute the baseline / advantage
            baseline_value = estimator_value.predict(transition.state)
            advantage = total_return - baseline_value
            estimator_value.update(transition.state, total_return)
            estimator_policy.update(transition.state, advantage, transition.action)
    return stats

tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator()
value_estimator = ValueEstimator()
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = reinforce(env, policy_estimator, value_estimator, 10, discount_factor=1.0)

plotting.plot_episode_stats(stats, smoothing_window=25)

 Actor-Critic algorithm
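Here the critic learns the state value and the actor is the policy; the one-step TD error acts as an (approximate) advantage for the actor, while the TD target is the regression target for the critic:

\delta_t = r_{t+1} + \gamma\, \hat{v}(s_{t+1}) - \hat{v}(s_t), \qquad
\text{critic target: } r_{t+1} + \gamma\, \hat{v}(s_{t+1}), \qquad
\text{actor weight: } \delta_t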

def actor_critic(env, estimator_policy, estimator_value, num_episode, discount_factor=1.0):
    """演员评价价值网络;评论员评价策略网络;TD error作为策略优化目标, TD target作为价值优化目标
    """
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episode),
                                  episode_rewards=np.zeros(num_episode))
    Transition = collections.namedtuple("Transition",
                                        ["state", "action", "reward", "next_state", "done"])
    for i_episode in range(num_episode):
        state = env.reset()
        if i_episode % 100 == 0:
            print(f"{i_episode}:episode")
            print(state)
        episode = []
        # roll out the episode step by step
        for t in itertools.count():
            # take one step
            action_probs = estimator_policy.predict(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            # record the transition and statistics
            episode.append(Transition(state=state, action=action, reward=reward, next_state=next_state, done=done))
            stats.episode_lengths[i_episode] = t
            stats.episode_rewards[i_episode] += reward
            # compute the TD target and TD error
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)
            # update the critic, with td_target as the regression target
            estimator_value.update(state, td_target)
            # update the actor, with td_error as the advantage estimate
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state
    return stats

 Actor-Critic for continuous action spaces
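For a continuous action space the policy below parameterizes a Gaussian: the network predicts μ(s) and σ(s), an action is sampled as a ~ N(μ(s), σ(s)) and clipped to the valid range, and the entropy-regularized loss (with δ the TD error and β = 0.1 in the code) is

\mathcal{L}(\theta) = -\log \mathcal{N}\big(a \mid \mu_\theta(s), \sigma_\theta(s)\big)\,\delta \;-\; \beta\, H\big[\mathcal{N}(\mu_\theta(s), \sigma_\theta(s))\big]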

import gym
import collections
import numpy as np
import itertools
import sklearn
from sklearn import preprocessing, pipeline
from lib import plotting
from sklearn.kernel_approximation import RBFSampler
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

env = gym.make("MountainCarContinuous-v0")
print(env.observation_space.sample())

# sample some states from the observation space and fit a standardizer
observation_examples = np.array([env.observation_space.sample() for _ in range(1000)])
scaler = preprocessing.StandardScaler()
scaler.fit(observation_examples)

# transform states into RBF features
featurizer = pipeline.FeatureUnion([
    ('rbf1', RBFSampler(gamma=5.0, n_components=100)),
    ('rbf2', RBFSampler(gamma=2.0, n_components=100)),
    ('rbf3', RBFSampler(gamma=1.0, n_components=100)),
    ('rbf4', RBFSampler(gamma=0.5, n_components=100)),

])
featurizer.fit(scaler.transform(observation_examples))

def featurize_state(state):
    """ 返回一个状态的特征表征,生成400dim向量
    """
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized[0]
# featurize_state(env.observation_space.sample())

class PolicyEstimator():
    """ 策略估计模型
    """
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            self.mu = tf.layers.dense(inputs=tf.expand_dims(self.state, 0),
                                      units=1)
            self.mu = tf.squeeze(self.mu)

            self.sigma = tf.layers.dense(inputs=tf.expand_dims(self.state, 0),
                                         units=1)
            self.sigma = tf.squeeze(self.sigma)
            self.sigma = tf.nn.softplus(self.sigma) + 1e-5
            self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)
            self.action = self.normal_dist._sample_n(1)
            self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])   # clip to the valid action range

            # loss and train
            self.loss = -self.normal_dist.log_prob(self.action) * self.target
            self.loss -= 1e-1 * self.normal_dist.entropy()

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step()
            )

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.action, {self.state: state})

    def update(self, state, target, action, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = {self.state: state, self.target: target, self.action: action}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

class ValueEstimator():
    """ J价值函数估计
    """
    def __init__(self, learning_rate=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            self.output_layer = tf.layers.dense(
                inputs=tf.expand_dims(self.state, 0),
                units=1
            )
            self.value_estimator = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimator, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.train.get_global_step()
            )

    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.value_estimator, {self.state: state})

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = {self.state: state, self.target:target}
        _, loss = sess.run([self.train_op, self.loss], feed_dict)

def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """ 演员-评论员算法。处理连续型问题
    """
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)
    )
    Transition = collections.namedtuple("Transition",
                                        ["state", "action", "reward", "next_state", "done"])
    for i_episode in range(num_episodes):
        if i_episode % 10 == 0:
            print(i_episode)
        state = env.reset()
        episode = []
        for t in itertools.count():
            action = estimator_policy.predict(state)
            next_state, reward, done, _ = env.step(action)
            episode.append(Transition(
                state=state, action=action, reward=reward, next_state=next_state, done=done
            ))
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)

            estimator_value.update(state, td_target)
            estimator_policy.update(state, td_error, action)

            if done:
                break
            state = next_state
    return stats

tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator(learning_rate=0.001)
value_estimator = ValueEstimator(learning_rate=0.1)
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)
    

 
