Reinforcement Learning in Practice - 01: The Stateless Problem (Multi-Armed Bandit)

import numpy as np
import random

# True payout probability of each of the 10 arms (hidden from the agent)
probs = np.random.uniform(size=10)
# Per-arm reward history, seeded with a single 1 so every mean is defined
# (this also gives an optimistic initial estimate, encouraging early exploration)
rewards = [[1] for _ in range(10)]
probs, rewards

# Greedy algorithm (in fact ε-greedy: it explores with a small fixed probability)
def greedy_choose_one():
    # With a small probability (ε = 0.01), pick a random arm (exploration)
    if random.random() < 0.01:
        return random.randint(0,9)
    # Compute the empirical mean reward of each slot machine
    rewards_mean = [np.mean(i) for i in rewards]
    # Pick the arm with the highest mean reward (exploitation)
    return np.argmax(rewards_mean)
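One drawback worth noting: because ε never changes, the agent keeps spending roughly ε·N pulls on random arms even long after the best arm has been identified. The decaying variant below addresses exactly this.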

# Decaying ε-greedy (as the number of plays grows, the exploration rate shrinks)
def decay_choose_one():
    # Total number of pulls so far across all arms
    play_count = sum([len(i) for i in rewards])
    # Explore with probability 1/t, which decays over time
    if random.random() < 1/play_count:
        return random.randint(0,9)
    rewards_mean = [np.mean(i) for i in rewards]
    return np.argmax(rewards_mean)
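Since play_count is the total number of pulls t so far (including the 10 seed entries), the exploration probability here is ε_t = 1/t: exploration dominates early, when the estimates are unreliable, and fades out as the empirical means converge.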

# Upper Confidence Bound (UCB) algorithm (tracks exploration vs. exploitation per machine)
def upon_confidence_choose():
    """Reference: https://www.cnblogs.com/Ryan0v0/p/11366578.html
    """
    # How many times each slot machine has been played
    play_count = [len(i) for i in rewards]
    play_count = np.array(play_count)
    
    # Numerator: total number of plays; the square root slows its growth
    fenzi = play_count.sum() ** 0.5
    # Denominator: per-machine play counts; the factor of 2 speeds up their growth
    fenmu = play_count * 2
    ucb = fenzi / fenmu
    # Square root: values above 1 shrink, values below 1 grow,
    # keeping the ucb bonus within a moderate range
    ucb = ucb ** 0.5
    
    rewards_mean = np.array([np.mean(i) for i in rewards])
    # Mean reward plus the ucb exploration bonus
    rewards_mean += ucb
    return rewards_mean.argmax()
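Written out, the exploration bonus this function adds to arm i is

    ucb_i = sqrt( sqrt(N) / (2 * n_i) )

where N is the total number of plays and n_i is the number of plays of arm i. This is a simplified variant; the textbook UCB1 bonus uses a logarithm of N instead of its square root (e.g. sqrt(2 * ln(N) / n_i)). Either way, the bonus shrinks as an arm is played more, so rarely tried arms temporarily look more attractive.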
    
# Beta distribution test
def beta_test():
    # A distribution over distributions; its parameters are the success/failure counts of a binomial
    print("With small parameters, samples from the Beta distribution are highly random")
    for _ in range(5):
        print(np.random.beta(1,1))
    print("With large parameters, the Beta distribution becomes stable")
    for _ in range(5):
        print(np.random.beta(1e5, 1e5))
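The printed behavior follows from the moments of the Beta distribution:

    mean(Beta(a, b)) = a / (a + b)
    var(Beta(a, b))  = a*b / ((a + b)**2 * (a + b + 1))

Beta(1, 1) is the uniform distribution on [0, 1] (variance 1/12), while Beta(1e5, 1e5) has variance of roughly 1/(8 * 1e5), so its samples cluster tightly around 0.5.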

def thompson_choose_one():
    """ Reference: https://blog.csdn.net/qq_24434491/article/details/114319241
    """
    # Number of times each machine paid reward 1, plus 1 (the Beta prior)
    get_reward_count = [sum(i)+1 for i in rewards]
    # Number of times each machine paid reward 0, plus 1
    loss_reward_count = [sum(1-np.array(i))+1 for i in rewards]
    # Sample from each Beta posterior; each sample can be read as that machine's win probability
    beta = np.random.beta(get_reward_count, loss_reward_count)
    return beta.argmax()
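This is Thompson sampling: with 0/1 rewards and a Beta(1, 1) prior, the posterior over a machine's win probability after s wins and f losses is Beta(s + 1, f + 1), which is exactly what the two +1 terms implement. Sampling once from each posterior and taking the argmax balances exploration and exploitation automatically: rarely played machines have wide posteriors and occasionally sample high, while well-explored machines are chosen on the strength of their actual means.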

def try_and_play():
#     i = greedy_choose_one()
#     i = decay_choose_one()
#     i = upon_confidence_choose()
    i = thompson_choose_one()
    # Play the chosen slot machine and observe the outcome
    reward = 0
    if random.random() < probs[i]:
        reward = 1
    # Record the outcome
    rewards[i].append(reward)
    
def play_N(num):
    # Play num rounds
    for _ in range(num):
        try_and_play()
    # Best expected total reward (always pulling the best arm)
    target = probs.max() * num
    # Total reward actually collected
    result = sum([sum(i) for i in rewards])
    return target, result
play_N(5000)
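play_N as written keeps accumulating into the global rewards list, so comparing strategies fairly requires a reset between runs. A minimal comparison sketch (run_experiment is an added helper, not part of the original post):

# Minimal comparison harness (a sketch): resets the play history before
# each strategy so all four are measured on the same probs.
def run_experiment(choose_fn, num=5000):
    global rewards
    rewards = [[1] for _ in range(10)]   # reset play history
    for _ in range(num):
        i = choose_fn()
        reward = 1 if random.random() < probs[i] else 0
        rewards[i].append(reward)
    target = probs.max() * num           # expected reward of always pulling the best arm
    result = sum(sum(r) for r in rewards)
    return target, result

for fn in (greedy_choose_one, decay_choose_one,
           upon_confidence_choose, thompson_choose_one):
    print(fn.__name__, run_experiment(fn))

For the stronger strategies, result should land close to target; the gap target - result is the regret.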


posted @ 2022-11-08 15:55  今夜无风