机器学习之朴素贝叶斯算法

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

import random
 
from numpy import ones, log, array
 
 
def load_dataset():
    """Return the built-in toy corpus used by the demo.

    Returns:
        A ``(words, class_list)`` tuple. ``words`` is a list of six
        tokenised posts (each a list of strings) and ``class_list`` is the
        parallel list of integer labels, one per post.
    """
    # Each entry pairs a label with the tokens of one post; the post that
    # contains "stupid"/"worthless" style words carries label 1.
    labelled_posts = [
        (0, ["my", "dog", "has", "flea", "problems", "help", "please"]),
        (1, ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"]),
        (0, ["my", "dalmation", "is", "so", "cute", "I", "love", "him"]),
        (1, ["stop", "posting", "stupid", "worthless", "garbage"]),
        (0, ["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"]),
        (1, ["quit", "buying", "worthless", "dog", "food", "stupid"]),
    ]
    words = [tokens for _, tokens in labelled_posts]
    class_list = [label for label, _ in labelled_posts]
    return words, class_list
 
 
def create_vocab_list(dataset):
    """Collect every distinct item that appears in any entry of ``dataset``.

    Args:
        dataset: Iterable of iterables (for example, a list of token lists).

    Returns:
        A list holding each distinct item once. The order is whatever the
        underlying set yields and should not be relied upon.
    """
    # Merge the per-document sets one after another into a fresh empty set.
    per_document = (set(document) for document in dataset)
    return list(set().union(*per_document))
 
 
def vocab_list2vec(vocab_list, input_set):
    """Convert a document into a count vector aligned with ``vocab_list``.

    Args:
        vocab_list: Sequence of vocabulary words; position ``i`` of the
            result counts the occurrences of ``vocab_list[i]``.
        input_set: Iterable of words making up one document.

    Returns:
        A list of ints with ``len(vocab_list)`` entries. Words that are not
        in the vocabulary are ignored.
    """
    # Build the word -> position lookup once so each word costs O(1) instead
    # of two linear scans (``in`` followed by ``.index``). ``setdefault``
    # keeps the first position of a repeated vocabulary word, which is what
    # ``list.index`` would have returned.
    position_of = {}
    for position, vocab_word in enumerate(vocab_list):
        position_of.setdefault(vocab_word, position)

    vec = [0] * len(vocab_list)
    for word in input_set:
        position = position_of.get(word)
        if position is not None:
            vec[position] += 1
    return vec
 
 
def train_classify(train_mat, train_category):
    """Fit the two-class naive Bayes model from per-document count vectors.

    Args:
        train_mat: Sequence of equally long numeric vectors, one per
            training document.
        train_category: Sequence of labels parallel to ``train_mat``. A
            label equal to 1 puts the document in class 1; any other value
            puts it in class 0.

    Returns:
        ``(p0_vect, p1_vect, p_abusive)``: the per-word log-probability
        arrays for class 0 and class 1, and the fraction of training
        documents obtained as ``sum(train_category) / len(train_mat)``.
    """
    doc_total = len(train_mat)
    vocab_size = len(train_mat[0])
    p_abusive = sum(train_category) / float(doc_total)

    # Index 0 / 1 of these two lists hold the running statistics of class
    # 0 / 1. Word counts start at one and totals at 2.0 so that a word never
    # seen in a class does not end up with probability zero (log(0)).
    word_counts = [ones(vocab_size), ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for i, row in enumerate(train_mat):
        label = 1 if train_category[i] == 1 else 0
        word_counts[label] += row
        word_totals[label] += sum(row)

    # Work in log space so later products of many small numbers become sums.
    p1_vect = log(word_counts[1] / word_totals[1])
    p0_vect = log(word_counts[0] / word_totals[0])
    return p0_vect, p1_vect, p_abusive
 
 
def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Pick the class whose log-score is larger for one document vector.

    Args:
        vec2classify: Count vector of the document to classify.
        p0_vect: Per-word log-probabilities for class 0.
        p1_vect: Per-word log-probabilities for class 1.
        p_class: Prior probability of class 1; ``1 - p_class`` is used as
            the prior of class 0.

    Returns:
        1 when the class-1 score is strictly greater, otherwise 0 (so ties
        go to class 0).
    """
    # Sum of log-likelihoods weighted by word counts, plus the log prior.
    score_one = sum(vec2classify * p1_vect) + log(p_class)
    score_zero = sum(vec2classify * p0_vect) + log(1 - p_class)
    return 1 if score_one > score_zero else 0
 
 
def testing():
    """Train on the built-in toy corpus and print the results for two posts.

    Prints the learned log-probability vectors and prior, then the predicted
    class for each of two hard-coded test entries.
    """
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)

    train_mat = []
    for document in words:
        train_mat.append(vocab_list2vec(vocab_list, document))

    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")

    # Classify each sample post with the model trained above.
    for test_entry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        doc = array(vocab_list2vec(vocab_list, test_entry))
        print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")
 
 
def words_bag2vec(vocab_list, input_set):
    """Build a bag-of-words count vector for one document.

    Args:
        vocab_list: Sequence of vocabulary words that fixes the meaning of
            each position in the returned vector.
        input_set: Iterable of words from the document; repeated words are
            counted once per occurrence.

    Returns:
        A list of ints, one per entry of ``vocab_list``, holding how many
        times that vocabulary word occurs in ``input_set``. Words outside
        the vocabulary are skipped.
    """
    # One dictionary lookup per word replaces the original pair of linear
    # scans (``in`` and ``.index``). Iterating the vocabulary back to front
    # lets the earliest position of a repeated word win, matching
    # ``list.index``.
    slot_by_word = {}
    for slot in range(len(vocab_list) - 1, -1, -1):
        slot_by_word[vocab_list[slot]] = slot

    vec = [0] * len(vocab_list)
    for word in input_set:
        if word in slot_by_word:
            vec[slot_by_word[word]] += 1
    return vec
 
 
def text_parse(string):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        string: The text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance. The text is
        split on runs of non-word characters and pieces of length two or
        less (including empty pieces) are dropped.
    """
    import re

    tokens = []
    for piece in re.split(r'\W+', string):
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens
 
 
def spam_test():
    """Train and evaluate the classifier on a random hold-out split of e-mails.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (paths relative to the
    current working directory), holds out 10 randomly chosen messages for
    testing, trains on the remaining ones and prints the fraction of
    held-out messages that were misclassified.

    Raises:
        OSError: If one of the e-mail files cannot be opened.
    """
    doc_list = []    # one token list per e-mail
    class_list = []  # 1 for spam, 0 for ham; parallel to doc_list
    for number in range(1, 26):
        for folder, label in (("spam", 1), ("ham", 0)):
            # The files are only read, so open them read-only.
            with open(f"email/{folder}/{number}.txt", "r") as f:
                text = f.read()
            # append, not extend: every entry must remain a whole document so
            # that doc_list[i] lines up with class_list[i] and the
            # vocabulary is built from words rather than characters.
            doc_list.append(text_parse(text))
            class_list.append(label)
    vocab_list = create_vocab_list(doc_list)

    # Randomly move 10 document indices from the training pool to the test set.
    train_list = list(range(len(doc_list)))
    test_list = []
    for _ in range(10):
        # randrange yields a valid integer index directly, with no float
        # truncation step.
        index = random.randrange(len(train_list))
        test_list.append(train_list.pop(index))

    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_abusive = train_classify(array(train_mat), array(train_category))

    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # A prediction that differs from the true label is an error.
        if classify(array(word_vec), p0_v, p1_v, p_abusive) != class_list[i]:
            error_count += 1
    print(f"error rate: {float(error_count) / len(test_list)}")
 
 
# Script entry point: run the toy-corpus demo first, then the e-mail
# hold-out evaluation (which reads files under ``email/``).
if __name__ == "__main__":
    testing()
    spam_test()

其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址：

`https://gitee.com/navysummer/machine-learning/tree/master/bayes`

posted @ 2024-06-29 19:12 NAVYSUMMER 阅读(9) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

公告

欢迎阅读『机器学习之朴素贝叶斯算法』

昵称： NAVYSUMMER
园龄： 8年6个月
粉丝： 38
关注： 2

+加关注

2025年2月

日

一

二

三

四

五

六

NAVYSUMMER

机器学习之朴素贝叶斯算法

公告

搜索

常用链接

最新随笔

我的标签

积分与排名

合集

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论