Hands-on ML and TF Chapter16 Reinforcement Learning
Policy Gradients
import os

import gym
import numpy as np
import tensorflow as tf
# Build the policy network used by the policy-gradient example.
# NOTE(review): reset_graph() is not defined anywhere in this listing; it is
# presumably a notebook helper that resets the default TF graph -- confirm.
reset_graph()

# Network architecture and optimizer settings.
n_inputs = 4
n_hidden = 4
n_outputs = 1
learning_rate = 0.01

initializer = tf.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)
logits = tf.layers.dense(hidden, n_outputs)
outputs = tf.nn.sigmoid(logits)  # probability of action 0 (left)

# Sample one action index from the two-way distribution [p, 1 - p].
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

# The sampled action is turned into the training label: y is 1 when action 0
# was drawn and 0 otherwise.
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]

# One placeholder per variable, shaped like its gradient, so that gradients
# computed outside the graph can be fed back in and applied by training_op.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward for every step of one episode.

    Args:
        rewards: Sequence of per-step rewards, in chronological order.
        discount_rate: Factor applied to the running total at each step back.

    Returns:
        A float NumPy array of ``len(rewards)`` entries where entry ``t`` is
        ``rewards[t] + discount_rate * entry[t + 1]`` (the last entry equals
        the last reward).
    """
    discounted_rewards = np.zeros(len(rewards))
    cumulative_rewards = 0
    # Walk backwards so each step can reuse the total accumulated after it.
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards
def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of several episodes and normalize them jointly.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_rate: Discount factor passed to ``discount_rewards``.

    Returns:
        A list with one array per episode holding the discounted rewards,
        shifted by the mean and divided by the standard deviation computed
        over all steps of all episodes.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward is identical; dividing would give 0/0 = NaN
        # and poison the gradients, so return the centered values (all zeros).
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]
# Train the policy network on CartPole with policy gradients.
env = gym.make("CartPole-v0")

n_games_per_update = 10  # episodes played before each parameter update
n_max_steps = 1000       # upper bound on the number of steps per episode
n_iterations = 250       # number of parameter updates
save_iterations = 10     # write a checkpoint every this many iterations
discount_rate = 0.95

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        print("\rIteration: {}".format(iteration), end="")
        all_rewards = []    # one list of raw rewards per episode
        all_gradients = []  # one list of per-step gradient lists per episode
        for game in range(n_games_per_update):
            current_rewards = []
            current_gradients = []
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action and, in the same run, get the gradients
                # that would make that action more likely.
                action_val, gradients_val = sess.run(
                    [action, gradients],
                    feed_dict={X: obs.reshape(1, n_inputs)})
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
        feed_dict = {}
        for var_index, gradient_placeholder in enumerate(gradient_placeholders):
            # mean_gradients has the same shape as the variable (np.mean over
            # axis 0): it is the reward-weighted average of that variable's
            # gradients over every step of every episode in this batch.
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                 for game_index, rewards in enumerate(all_rewards)
                 for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[gradient_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./my_policy_net_pg.ckpt")
Markov Decision Process
Learning to Play Ms. Pac-Man Using Deep Q-Learning
这部分首先解决在Windows下安装gym[atari]模块
查阅了很多资料,发现很多解决办法都是使用这个连接:
pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py
这种方法的出处是 OpenAI Gym Atari on Windows
这个gym的github issue中也提到这个方法。
这个How to Install OpenAI Gym in a Windows Environment也是使用这个方法安装atari模块
也可以在下面这个网址上下载whl进行离线安装
https://github.com/Kojoley/atari-py/releases
对于安装whl格式的文件,首先要安装wheel包
pip install wheel
由于Anaconda中默认安装了wheel包,所以上面的步骤可以省略。
pip install atari_py-0.1.7-cp36-cp36m-win_amd64.whl
绘制智能体和环境交互的动画
# Import the required packages
import matplotlib
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Store every rendered frame in `frames`.
frames = []
n_max_steps = 1000
n_change_steps = 10  # a new random action is drawn every this many steps
obs = env.reset()
for step in range(n_max_steps):
    img = env.render(mode="rgb_array")
    frames.append(img)
    if step % n_change_steps == 0:
        action = env.action_space.sample()  # play randomly
    obs, reward, done, info = env.step(action)
    if done:
        break
# Functions used to draw the animation.
def update_scene(num, frames, patch):
    """Show frame number ``num`` on ``patch``.

    Args:
        num: Index into ``frames`` of the frame to display.
        frames: Sequence of image frames.
        patch: Object exposing ``set_data``; it receives ``frames[num]``.

    Returns:
        A one-element tuple containing ``patch``.
    """
    patch.set_data(frames[num])
    return patch,
def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays ``frames`` in order.

    Args:
        frames: Non-empty sequence of images; ``frames[0]`` is drawn first.
        repeat: Forwarded to ``FuncAnimation`` as ``repeat``.
        interval: Forwarded to ``FuncAnimation`` as ``interval``.

    Returns:
        The ``animation.FuncAnimation`` object driving ``update_scene``.
    """
    plt.close()  # or else nbagg sometimes plots in the previous cell
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    return animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
# Call the function to display the animation.
video = plot_animation(frames)
plt.show()
DQL算法部分
首先是对obs进行预处理,转换为88x80的灰度图像并增强对比度。
import gym
import matplotlib.pyplot as plt
import numpy as np
# Create the Ms. Pac-Man environment and display its first raw observation.
env = gym.make('MsPacman-v0')
obs = env.reset()
plt.imshow(obs)
plt.show()
# Greyscale value (mean of the RGB channels) of the colour [210, 164, 74];
# pixels with exactly this value are blanked out during preprocessing.
mspacman_color = np.array([210, 164, 74]).mean()


def preprocess_observation(obs):
    """Crop, downsample, greyscale and normalize one observation frame.

    Args:
        obs: Image array indexed as (row, column, channel). It must be large
            enough that ``obs[1:176:2, ::2]`` yields 88 x 80 pixels.

    Returns:
        A float array of shape (88, 80, 1) with values in [-1, 1).
    """
    img = obs[1:176:2, ::2]  # crop and downsize
    img = img.mean(axis=2)  # to greyscale
    img[img == mspacman_color] = 0  # improve contrast
    # Normalize from -1. to 1. The previous expression subtracted an extra 1,
    # which mapped the 0..255 input onto [-2, 0) instead.
    img = (img - 128) / 128
    return img.reshape(88, 80, 1)
# Preview the preprocessed frame as an 88x80 greyscale image.
pre_obs = preprocess_observation(obs).reshape(88,80)
plt.imshow(pre_obs, cmap ='gray')
plt.show()
DQN算法
# ------------------------------ Build the DQN computation graph -----------------------------------------
from tensorflow.contrib.layers import convolution2d, fully_connected
# Shape of the state images fed to the network.
input_height = 88
input_width = 80
input_channels = 1
# Per-layer settings of the three convolutional layers; the lists are zipped
# together when the layers are created.
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n # 9 discrete actions are available
initializer = tf.variance_scaling_initializer()
def q_network(X_state, name):
    """Build one Q-network (three conv layers + two dense layers).

    Args:
        X_state: Tensor of input states; it is divided by 128 before the
            first convolution.
        name: Variable scope under which all layers are created.

    Returns:
        A tuple ``(outputs, trainable_vars_by_name)``: the output tensor with
        ``n_outputs`` units, and a dict mapping each trainable variable's name
        (with the scope name prefix stripped) to the variable.
    """
    # NOTE(review): the original comment says this scales pixel intensities to
    # the [-1.0, 1.0] range, which only holds for inputs in about [-128, 127].
    # preprocess_observation in this file appears to return floats that are
    # already normalized -- confirm whether this division is still wanted.
    prev_layer = X_state / 128.0
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(
                conv_n_maps, conv_kernel_sizes, conv_strides,
                conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(
                prev_layer, filters=n_maps, kernel_size=kernel_size,
                strides=strides, padding=padding, activation=activation,
                kernel_initializer=initializer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                 activation=hidden_activation,
                                 kernel_initializer=initializer)
        outputs = tf.layers.dense(hidden, n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    # Strip the scope prefix so that two networks built under different
    # scopes share the same keys.
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name
# -------------------------------- Copying parameters between the two DQNs ----------------------------------
# Both networks read from the same state placeholder.
X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
input_channels])
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")
# One assign op per target variable, matched to the online variable that has
# the same scope-relative name; grouped into a single op.
copy_ops = [target_var.assign(online_vars[var_name])
for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)
# ------------------------------ Build the actor's loss function and optimizer
learning_rate = 0.001
momentum = 0.95

with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    # Keep only the Q-value of the action that was actually taken.
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keepdims=True)
    # Loss is quadratic for absolute errors up to 1 and linear beyond that.
    error = tf.abs(y - q_value)
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()
# ---------------------------- Replay memory, used instead of the deque from the book -----------------------------------
class ReplayMemory:
    """Fixed-size circular buffer of arbitrary Python objects.

    Once ``maxlen`` items have been stored, each new item overwrites the
    oldest one.
    """

    def __init__(self, maxlen):
        """Create an empty buffer able to hold ``maxlen`` items."""
        self.maxlen = maxlen
        # The builtin `object` replaces the `np.object` alias, which was
        # removed in NumPy 1.24.
        self.buf = np.empty(shape=maxlen, dtype=object)
        self.index = 0   # slot the next item will be written to
        self.length = 0  # number of valid items, capped at maxlen

    def append(self, data):
        """Store ``data`` in the next slot, wrapping around when full."""
        self.buf[self.index] = data
        self.length = min(self.length + 1, self.maxlen)
        self.index = (self.index + 1) % self.maxlen

    def sample(self, batch_size, with_replacement=True):
        """Return randomly chosen stored items as an object array.

        Args:
            batch_size: Number of items requested.
            with_replacement: When true, items may be picked more than once.
                When false, items are distinct and at most ``self.length``
                of them are returned.
        """
        if with_replacement:
            indices = np.random.randint(self.length, size=batch_size)  # faster
        else:
            indices = np.random.permutation(self.length)[:batch_size]
        return self.buf[indices]
# Module-level replay buffer holding up to replay_memory_size entries.
replay_memory_size = 500000
replay_memory = ReplayMemory(replay_memory_size)
# ---------------------------------- Sampling from the replay memory --------------------------------------
def sample_memories(batch_size):
    """Draw a batch from ``replay_memory`` and split it into columns.

    Args:
        batch_size: Number of memories to sample.

    Returns:
        Five arrays: states, actions, rewards, next states and continue
        flags. Rewards and continue flags are reshaped into column vectors.
    """
    cols = [[], [], [], [], []]  # state, action, reward, next_state, continue
    for memory in replay_memory.sample(batch_size):
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3],
            cols[4].reshape(-1, 1))
# ------------------------------- Epsilon-greedy policy -----------------------------------------
eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000


def epsilon_greedy(q_values, step):
    """Choose an action, exploring at random with probability epsilon.

    Epsilon falls linearly from ``eps_max`` at step 0 to ``eps_min`` at
    ``eps_decay_steps`` and stays at ``eps_min`` afterwards.

    Args:
        q_values: Q-value estimates; ``np.argmax`` of them is the greedy action.
        step: Current training step, which drives the decay of epsilon.

    Returns:
        A random integer in ``[0, n_outputs)`` with probability epsilon,
        otherwise the index of the largest Q-value.
    """
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)  # random action
    else:
        return np.argmax(q_values)  # optimal action
# ------------------------------- DQN training loop --------------------------------------------
n_steps = 4000000  # total number of training steps
training_start = 10000  # start training after 10,000 game iterations
training_interval = 4  # run a training step every 4 game iterations
save_steps = 1000  # save the model every 1,000 training steps
copy_steps = 10000  # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.99
skip_start = 90  # Skip the start of every game (it's just waiting time).
batch_size = 50
iteration = 0  # game iterations
checkpoint_path = "./my_dqn.ckpt"
done = True  # env needs to be reset

# Progress statistics shown in the status line.
loss_val = np.inf  # np.infty was removed in NumPy 2.0; np.inf is the same value
game_length = 0
total_max_q = 0
mean_max_q = 0.0

with tf.Session() as sess:
    # Resume from an existing checkpoint, otherwise start from scratch with
    # the target network initialized to the online network's weights.
    if os.path.isfile(checkpoint_path + ".index"):
        saver.restore(sess, checkpoint_path)
    else:
        init.run()
        copy_online_to_target.run()
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break
        iteration += 1
        print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f} ".format(
            iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")
        if done:  # game over, start again
            obs = env.reset()
            for skip in range(skip_start):  # skip the start of each game
                obs, reward, done, info = env.step(0)
            state = preprocess_observation(obs)

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0

        if iteration < training_start or iteration % training_interval != 0:
            continue  # only train after warmup period and at regular intervals

        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))
        next_q_values = target_q_values.eval(
            feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_val = rewards + continues * discount_rate * max_next_q_values

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            copy_online_to_target.run()

        # And save regularly
        if step % save_steps == 0:
            saver.save(sess, checkpoint_path)
posted on 2019-05-26 14:23 Frank_Allen 阅读(386) 评论(0) 编辑 收藏 举报