Bayesian Inference | Naive Bayes Classification | Bayes' Theorem
Recently, a project required Bayes' theorem and some related material, so I reviewed the topic systematically and took notes along the way.
References:
- Fundamentals of probability theory and mathematical statistics: a very accessible slide deck, good for reviewing material from university
- 算法杂货铺 — Naive Bayesian classification: explains the theory in great detail and very clearly
- Naive Bayes learning and classification: very concise, with a good Python implementation
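
Before the code, a one-line refresher on what it implements. Naive Bayes applies Bayes' theorem under the assumption that the features are conditionally independent given the class, so the classifier only needs the class priors and the per-feature conditional probabilities; the code below estimates both by simple counting over the training set:

```latex
P(y \mid x_1, \dots, x_n)
  = \frac{P(y)\, P(x_1, \dots, x_n \mid y)}{P(x_1, \dots, x_n)}
  \propto P(y) \prod_{i=1}^{n} P(x_i \mid y),
\qquad
\hat{y} = \arg\max_{y} \; P(y) \prod_{i=1}^{n} P(x_i \mid y).
```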
Code (with detailed comments):
```python
# -*- coding: utf-8 -*-
import copy  # used for deep copies of the nested data structures
# If the nested data structures are hard to follow, sketch them on paper;
# once drawn out they become obvious.
# Python is indentation-sensitive: incorrect indentation raises an error.


class native_bayes:
    def __init__(self, character_vec_, class_vec_):
        """
        Constructor; see the calls at the bottom of the file for usage.
        character_vec_: [("character_A", ["A1", "A2", "A3"]),
                         ("character_B", ["B1", "B2", "B3"])]
            A nested structure: a list of tuples, each holding a feature
            name and the list of its possible values.
        class_vec_: ["class_X", "class_Y"]
        """
        # Build the counting structure: a three-level nested dict,
        # {feature name -> {feature value -> {'num', 'condition_per'}}}.
        character_condition_per = {}
        for character_name in character_vec_:
            character_condition_per[character_name[0]] = {}
            for character_value in character_name[1]:
                character_condition_per[character_name[0]][character_value] = {
                    'num': 0,             # count of this feature value within a class
                    'condition_per': 0.0  # conditional probability of this value given the class
                }

        # Per-class statistics: a two-level dict with the three-level dict nested inside.
        self.class_set = {}
        for class_name in class_vec_:
            self.class_set[class_name] = {
                'num': 0,          # number of training samples of this class
                'class_per': 0.0,  # prior probability of this class in the training set
                # deep-copy the three-level dict so each class gets its own counters
                'character_condition_per': copy.deepcopy(character_condition_per),
            }
        # print("init", character_vec_, self.class_set)  # for debugging

    def learn(self, sample_):
        """
        Training function. sample_ has the form:
        [
            {
                'character': {'character_A': 'A1'},  # feature vector
                'class_name': 'class_X'              # class label
            },
            ...
        ]
        """
        # First pass: count classes and feature values.
        for each_sample in sample_:
            character_vec_ = each_sample['character']
            class_name = each_sample['class_name']
            data_for_class = self.class_set[class_name]
            data_for_class['num'] += 1  # one more sample of this class
            for character_name in character_vec_:  # iterating a dict yields its keys
                character_value = character_vec_[character_name]
                data_for_character = data_for_class['character_condition_per'][character_name][character_value]
                data_for_character['num'] += 1

        # Counting done; second pass computes the probabilities.
        sample_num = len(sample_)
        for each_sample in sample_:
            character_vec_ = each_sample['character']
            class_name = each_sample['class_name']
            data_for_class = self.class_set[class_name]
            # Prior probability of the class.
            data_for_class['class_per'] = float(data_for_class['num']) / sample_num
            # Conditional probability of each feature value given the class.
            for character_name in character_vec_:
                character_value = character_vec_[character_name]
                data_for_character = data_for_class['character_condition_per'][character_name][character_value]
                data_for_character['condition_per'] = float(data_for_character['num']) / data_for_class['num']
        # from pprint import pprint
        # pprint(self.class_set)  # for debugging

    def classify(self, input_):
        """
        Classification function. input_ has the form:
        {
            "character_A": "A1",
            "character_B": "B3",
        }
        """
        best_class = ''
        max_per = 0.0
        for class_name in self.class_set:
            class_data = self.class_set[class_name]
            per = class_data['class_per']
            # Multiply the prior by the conditional probability of every feature value.
            for character_name in input_:
                character_per_data = class_data['character_condition_per'][character_name]
                per = per * character_per_data[input_[character_name]]['condition_per']
            print(class_name, per)
            if per >= max_per:
                max_per = per  # remember the best score seen so far
                best_class = class_name
        return best_class


# Naming convention: parameters inside the class end with "_",
# module-level names do not, which makes them easy to tell apart.

# Driver code.
character_vec = [("character_A", ["A1", "A2", "A3"]), ("character_B", ["B1", "B2", "B3"])]
class_vec = ["class_X", "class_Y"]
bayes = native_bayes(character_vec, class_vec)  # create the classifier

sample = [  # training set: feature vector plus class label
    {'character': {'character_A': 'A1', 'character_B': 'B1'}, 'class_name': 'class_X'},
    {'character': {'character_A': 'A3', 'character_B': 'B1'}, 'class_name': 'class_X'},
    {'character': {'character_A': 'A3', 'character_B': 'B3'}, 'class_name': 'class_X'},
    {'character': {'character_A': 'A2', 'character_B': 'B2'}, 'class_name': 'class_X'},
    {'character': {'character_A': 'A2', 'character_B': 'B2'}, 'class_name': 'class_Y'},
    {'character': {'character_A': 'A3', 'character_B': 'B1'}, 'class_name': 'class_Y'},
    {'character': {'character_A': 'A1', 'character_B': 'B3'}, 'class_name': 'class_Y'},
    {'character': {'character_A': 'A1', 'character_B': 'B3'}, 'class_name': 'class_Y'},
]

input_data = {  # test input
    "character_A": "A1",
    "character_B": "B3",
}

bayes.learn(sample)                # train
print(bayes.classify(input_data))  # classify
```
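
Running the script prints the unnormalized score for each class and then the predicted label. With the eight training samples above, class_X scores 4/8 · 1/4 · 1/4 = 0.03125 and class_Y scores 4/8 · 2/4 · 2/4 = 0.125, so the prediction is class_Y.

The same data can be cross-checked with scikit-learn's CategoricalNB. The sketch below assumes scikit-learn is installed; note that CategoricalNB applies Laplace smoothing (alpha=1.0 by default), so its probability estimates differ from the raw-count values above, although the predicted class happens to agree here.

```python
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

# The same training set as above, one row per sample,
# columns ordered as (character_A, character_B).
X = [["A1", "B1"], ["A3", "B1"], ["A3", "B3"], ["A2", "B2"],
     ["A2", "B2"], ["A3", "B1"], ["A1", "B3"], ["A1", "B3"]]
y = ["class_X", "class_X", "class_X", "class_X",
     "class_Y", "class_Y", "class_Y", "class_Y"]

encoder = OrdinalEncoder()        # map the category strings to integer codes
X_enc = encoder.fit_transform(X)

clf = CategoricalNB()             # uses Laplace smoothing (alpha=1.0) by default
clf.fit(X_enc, y)

query = encoder.transform([["A1", "B3"]])
print(clf.predict(query))         # expected: ['class_Y']
print(clf.predict_proba(query))   # smoothed posteriors, columns ordered as clf.classes_
```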