Hands-on Reinforcement Learning (2): Multi-Armed Bandit (MAB) Code
1. Epsilon-Greedy
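This section implements the epsilon-greedy algorithm on a Bernoulli multi-armed bandit: with probability $\epsilon$ the agent explores by pulling a uniformly random arm, and otherwise it exploits by pulling the arm with the highest estimated reward. The performance measure tracked by `update_regret` in the code below is the cumulative regret

$$
R(T) = \sum_{t=1}^{T} \big( Q^{*} - Q(a_t) \big), \qquad Q^{*} = \max_{a} Q(a),
$$

where $Q(a)$ is the true win probability of arm $a$ and $a_t$ is the arm pulled at step $t$.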
```python
import numpy as np
import matplotlib.pyplot as plt


class BernoulliBandit:
    """ Bernoulli multi-armed bandit; K is the number of arms. """
    def __init__(self, K):
        self.probs = np.random.uniform(size=K)  # K random win probabilities in (0, 1)
        self.best_idx = np.argmax(self.probs)   # arm with the highest win probability
        self.best_prob = self.probs[self.best_idx]  # the highest win probability
        self.K = K

    def step(self, k):
        # After the player pulls arm k, return 1 (win) or 0 (loss)
        # according to that arm's win probability
        if np.random.rand() < self.probs[k]:
            return 1
        else:
            return 0


class Solver:
    """ Basic framework for multi-armed bandit algorithms. """
    def __init__(self, bandit):
        self.bandit = bandit
        self.counts = np.zeros(self.bandit.K)  # number of pulls of each arm
        self.regret = 0.    # cumulative regret so far
        self.actions = []   # list recording the action taken at each step
        self.regrets = []   # list recording the cumulative regret at each step

    def update_regret(self, k):
        # Update and record the cumulative regret; k is the arm chosen this step
        self.regret += self.bandit.best_prob - self.bandit.probs[k]
        self.regrets.append(self.regret)

    def run_one_step(self):
        # Return the arm to pull this step; implemented by each concrete strategy
        raise NotImplementedError

    def run(self, num_steps):
        # Run the solver for num_steps steps
        for _ in range(num_steps):
            k = self.run_one_step()
            self.counts[k] += 1
            self.actions.append(k)
            self.update_regret(k)


class EpsilonGreedy(Solver):
    """ Epsilon-greedy algorithm; inherits from Solver. """
    def __init__(self, bandit, epsilon=0.01, init_prob=1.0):
        super(EpsilonGreedy, self).__init__(bandit)
        self.epsilon = epsilon
        self.estimates = np.array([init_prob] * self.bandit.K)  # initial reward estimates

    def run_one_step(self):
        if np.random.random() < self.epsilon:
            k = np.random.randint(0, self.bandit.K)  # explore: pick a random arm
        else:
            k = np.argmax(self.estimates)  # exploit: pick the arm with the highest estimate
        r = self.bandit.step(k)  # reward for this action
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])
        return k


class DecayingEpsilonGreedy(Solver):
    """ Epsilon-greedy with an epsilon that decays over time; inherits from Solver. """
    def __init__(self, bandit, init_prob=1.0):
        super(DecayingEpsilonGreedy, self).__init__(bandit)
        self.estimates = np.array([init_prob] * self.bandit.K)
        self.total_count = 0

    def run_one_step(self):
        self.total_count += 1
        if np.random.random() < 1 / self.total_count:  # epsilon decays with time
            k = np.random.randint(0, self.bandit.K)
        else:
            k = np.argmax(self.estimates)
        r = self.bandit.step(k)
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])
        return k


def plot_results(solvers, solver_names):
    """Plot cumulative regret over time. solvers is a list of strategies;
    solver_names is a list of their names."""
    for idx, solver in enumerate(solvers):
        time_list = range(len(solver.regrets))
        plt.plot(time_list, solver.regrets, label=solver_names[idx])
    plt.xlabel('Time steps')
    plt.ylabel('Cumulative regrets')
    plt.title('%d-armed bandit' % solvers[0].bandit.K)
    plt.legend()
    plt.show()


if __name__ == '__main__':
    np.random.seed(1)  # fix the random seed so the experiment is reproducible
    K = 10
    bandit_10_arm = BernoulliBandit(K)

    epsilon_greedy_solver = EpsilonGreedy(bandit_10_arm, epsilon=0.01)
    epsilon_greedy_solver.run(5000)
    print('cumulative regret of epsilon-greedy:', epsilon_greedy_solver.regret)
    plot_results([epsilon_greedy_solver], ["EpsilonGreedy"])

    np.random.seed(0)
    epsilons = [1e-4, 0.01, 0.1, 0.25, 0.5]
    epsilon_greedy_solver_list = [
        EpsilonGreedy(bandit_10_arm, epsilon=e) for e in epsilons
    ]
    epsilon_greedy_solver_names = ["epsilon={}".format(e) for e in epsilons]
    for solver in epsilon_greedy_solver_list:
        solver.run(5000)
    plot_results(epsilon_greedy_solver_list, epsilon_greedy_solver_names)

    np.random.seed(1)
    decaying_epsilon_greedy_solver = DecayingEpsilonGreedy(bandit_10_arm)
    decaying_epsilon_greedy_solver.run(5000)
    print('cumulative regret of decaying epsilon-greedy:',
          decaying_epsilon_greedy_solver.regret)
    plot_results([decaying_epsilon_greedy_solver], ["DecayingEpsilonGreedy"])
```
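The estimate update `self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])` is the incremental form of the sample mean (the `+ 1` is needed because `run` only increments `counts[k]` after `run_one_step` returns). Writing $Q_k$ for the mean of the first $k$ rewards of an arm,

$$
Q_k = \frac{1}{k} \sum_{i=1}^{k} r_i = Q_{k-1} + \frac{1}{k} \big( r_k - Q_{k-1} \big),
$$

so each update costs $O(1)$ time and memory instead of requiring every past reward to be stored.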
2. Thompson Sampling
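Thompson sampling keeps a Beta distribution over each arm's unknown win probability. Because the Beta prior is conjugate to the Bernoulli likelihood, observing a reward $r \in \{0, 1\}$ updates the posterior in closed form:

$$
\mathrm{Beta}(a, b) \;\longrightarrow\; \mathrm{Beta}(a + r,\; b + 1 - r).
$$

Starting from the uniform prior $\mathrm{Beta}(1, 1)$, this is exactly what the `_a[k] += r` and `_b[k] += (1 - r)` lines below implement; each step samples one value per arm from its posterior and pulls the arm with the largest sample.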
```python
import numpy as np
import matplotlib.pyplot as plt


class BernoulliBandit:
    """ Bernoulli multi-armed bandit; K is the number of arms. """
    def __init__(self, K):
        self.probs = np.random.uniform(size=K)  # K random win probabilities in (0, 1)
        self.best_idx = np.argmax(self.probs)   # arm with the highest win probability
        self.best_prob = self.probs[self.best_idx]  # the highest win probability
        self.K = K

    def step(self, k):
        # After the player pulls arm k, return 1 (win) or 0 (loss)
        # according to that arm's win probability
        if np.random.rand() < self.probs[k]:
            return 1
        else:
            return 0


class Solver:
    """ Basic framework for multi-armed bandit algorithms. """
    def __init__(self, bandit):
        self.bandit = bandit
        self.counts = np.zeros(self.bandit.K)  # number of pulls of each arm
        self.regret = 0.    # cumulative regret so far
        self.actions = []   # list recording the action taken at each step
        self.regrets = []   # list recording the cumulative regret at each step

    def update_regret(self, k):
        # Update and record the cumulative regret; k is the arm chosen this step
        self.regret += self.bandit.best_prob - self.bandit.probs[k]
        self.regrets.append(self.regret)

    def run_one_step(self):
        # Return the arm to pull this step; implemented by each concrete strategy
        raise NotImplementedError

    def run(self, num_steps):
        # Run the solver for num_steps steps
        for _ in range(num_steps):
            k = self.run_one_step()
            self.counts[k] += 1
            self.actions.append(k)
            self.update_regret(k)


class ThompsonSampling(Solver):
    """ Thompson sampling algorithm; inherits from Solver. """
    def __init__(self, bandit):
        super(ThompsonSampling, self).__init__(bandit)
        self._a = np.ones(self.bandit.K)  # per-arm count of reward-1 outcomes (plus the prior)
        self._b = np.ones(self.bandit.K)  # per-arm count of reward-0 outcomes (plus the prior)

    def run_one_step(self):
        samples = np.random.beta(self._a, self._b)  # sample one value per arm from its Beta posterior
        k = np.argmax(samples)  # pick the arm with the largest sample
        r = self.bandit.step(k)
        self._a[k] += r        # update the first Beta parameter
        self._b[k] += (1 - r)  # update the second Beta parameter
        return k


def plot_results(solvers, solver_names):
    """Plot cumulative regret over time. solvers is a list of strategies;
    solver_names is a list of their names."""
    for idx, solver in enumerate(solvers):
        time_list = range(len(solver.regrets))
        plt.plot(time_list, solver.regrets, label=solver_names[idx])
    plt.xlabel('Time steps')
    plt.ylabel('Cumulative regrets')
    plt.title('%d-armed bandit' % solvers[0].bandit.K)
    plt.legend()
    plt.show()


if __name__ == '__main__':
    np.random.seed(1)  # fix the random seed so the experiment is reproducible
    K = 10
    bandit_10_arm = BernoulliBandit(K)
    thompson_sampling_solver = ThompsonSampling(bandit_10_arm)
    thompson_sampling_solver.run(5000)
    print('cumulative regret of Thompson sampling:', thompson_sampling_solver.regret)
    plot_results([thompson_sampling_solver], ["ThompsonSampling"])
```
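As a sanity check, the learned posteriors can be visualized directly. The following is a minimal sketch, not from the original post: it assumes the `BernoulliBandit` and `ThompsonSampling` classes defined above are in scope, reads the (private) `_a` and `_b` counters, and requires SciPy for the Beta density.

```python
# Minimal sketch (assumption: BernoulliBandit and ThompsonSampling from above
# are in scope, and SciPy is installed).
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta

np.random.seed(1)
bandit = BernoulliBandit(10)
solver = ThompsonSampling(bandit)
solver.run(5000)

x = np.linspace(0.001, 0.999, 200)
for k in range(bandit.K):
    # Arm k's posterior is Beta(_a[k], _b[k]); frequently pulled arms get sharp peaks
    plt.plot(x, beta.pdf(x, solver._a[k], solver._b[k]), label='arm %d' % k)
plt.xlabel('Win probability')
plt.ylabel('Posterior density')
plt.legend()
plt.show()
```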
3. UCB
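UCB adds an exploration bonus derived from Hoeffding's inequality: for an arm pulled $N_t(a)$ times, $P\big(Q(a) \ge \hat{Q}_t(a) + u\big) \le e^{-2 N_t(a) u^2}$. Setting the right-hand side to $1/t$ and solving for $u$ gives the confidence radius used in the code below,

$$
\hat{U}_t(a) = \sqrt{\frac{\log t}{2\,\big(N_t(a) + 1\big)}}, \qquad
a_t = \arg\max_{a} \Big[ \hat{Q}_t(a) + c \cdot \hat{U}_t(a) \Big],
$$

where the $+1$ in the denominator avoids division by zero for arms that have never been pulled, and the coefficient $c$ (`coef`) controls the weight of the uncertainty term.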
```python
import numpy as np
import matplotlib.pyplot as plt


class BernoulliBandit:
    """ Bernoulli multi-armed bandit; K is the number of arms. """
    def __init__(self, K):
        self.probs = np.random.uniform(size=K)  # K random win probabilities in (0, 1)
        self.best_idx = np.argmax(self.probs)   # arm with the highest win probability
        self.best_prob = self.probs[self.best_idx]  # the highest win probability
        self.K = K

    def step(self, k):
        # After the player pulls arm k, return 1 (win) or 0 (loss)
        # according to that arm's win probability
        if np.random.rand() < self.probs[k]:
            return 1
        else:
            return 0


class Solver:
    """ Basic framework for multi-armed bandit algorithms. """
    def __init__(self, bandit):
        self.bandit = bandit
        self.counts = np.zeros(self.bandit.K)  # number of pulls of each arm
        self.regret = 0.    # cumulative regret so far
        self.actions = []   # list recording the action taken at each step
        self.regrets = []   # list recording the cumulative regret at each step

    def update_regret(self, k):
        # Update and record the cumulative regret; k is the arm chosen this step
        self.regret += self.bandit.best_prob - self.bandit.probs[k]
        self.regrets.append(self.regret)

    def run_one_step(self):
        # Return the arm to pull this step; implemented by each concrete strategy
        raise NotImplementedError

    def run(self, num_steps):
        # Run the solver for num_steps steps
        for _ in range(num_steps):
            k = self.run_one_step()
            self.counts[k] += 1
            self.actions.append(k)
            self.update_regret(k)


class UCB(Solver):
    """ Upper confidence bound (UCB) algorithm; inherits from Solver. """
    def __init__(self, bandit, coef, init_prob=1.0):
        super(UCB, self).__init__(bandit)
        self.total_count = 0
        self.estimates = np.array([init_prob] * self.bandit.K)
        self.coef = coef

    def run_one_step(self):
        self.total_count += 1
        ucb = self.estimates + self.coef * np.sqrt(
            np.log(self.total_count) / (2 * (self.counts + 1)))  # upper confidence bound
        k = np.argmax(ucb)  # pick the arm with the largest upper confidence bound
        r = self.bandit.step(k)
        self.estimates[k] += 1. / (self.counts[k] + 1) * (r - self.estimates[k])
        return k


def plot_results(solvers, solver_names):
    """Plot cumulative regret over time. solvers is a list of strategies;
    solver_names is a list of their names."""
    for idx, solver in enumerate(solvers):
        time_list = range(len(solver.regrets))
        plt.plot(time_list, solver.regrets, label=solver_names[idx])
    plt.xlabel('Time steps')
    plt.ylabel('Cumulative regrets')
    plt.title('%d-armed bandit' % solvers[0].bandit.K)
    plt.legend()
    plt.show()


if __name__ == '__main__':
    np.random.seed(1)  # fix the random seed so the experiment is reproducible
    K = 10
    bandit_10_arm = BernoulliBandit(K)
    print("generated a random %d-armed Bernoulli bandit" % K)
    print("arm %d has the highest win probability: %.4f" %
          (bandit_10_arm.best_idx, bandit_10_arm.best_prob))

    np.random.seed(1)
    coef = 1  # coefficient controlling the weight of the uncertainty term
    UCB_solver = UCB(bandit_10_arm, coef)
    UCB_solver.run(5000)
    print('cumulative regret of UCB:', UCB_solver.regret)
    plot_results([UCB_solver], ["UCB"])
```
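Since the three sections share the same `BernoulliBandit`, `Solver`, and `plot_results`, the algorithms are easy to compare head-to-head. The following is a minimal sketch, not from the original post; it assumes `EpsilonGreedy`, `UCB`, and `ThompsonSampling` from the sections above are all in scope (for example, collected into one module).

```python
# Minimal sketch (assumption: the classes and plot_results defined in the
# sections above are all in scope). Runs all three algorithms on one bandit
# and plots their cumulative regrets on the same axes.
import numpy as np

np.random.seed(1)
bandit = BernoulliBandit(10)

np.random.seed(1)
solvers = [
    EpsilonGreedy(bandit, epsilon=0.01),
    UCB(bandit, coef=1),
    ThompsonSampling(bandit),
]
names = ["EpsilonGreedy", "UCB", "ThompsonSampling"]
for s in solvers:
    s.run(5000)

plot_results(solvers, names)
```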