机器学习之朴素贝叶斯算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import random
 
from numpy import ones, log, array
 
 
def load_dataset():
    """Return the built-in toy corpus of tokenized posts and their labels.

    Returns:
        tuple: ``(words, class_list)`` where ``words`` is a list of six
        token lists and ``class_list`` is the parallel list of 0/1 labels.
    """
    # Each entry pairs a whitespace-separated post with its class label.
    labelled_posts = (
        ("my dog has flea problems help please", 0),
        ("maybe not take him to dog park stupid", 1),
        ("my dalmation is so cute I love him", 0),
        ("stop posting stupid worthless garbage", 1),
        ("mr licks ate my steak how to stop him", 0),
        ("quit buying worthless dog food stupid", 1),
    )
    words = [sentence.split() for sentence, _ in labelled_posts]
    class_list = [label for _, label in labelled_posts]
    return words, class_list
 
 
def create_vocab_list(dataset):
    """Collect the distinct items appearing anywhere in ``dataset``.

    Args:
        dataset: an iterable of iterables (e.g. a list of token lists).

    Returns:
        list: every distinct item exactly once; the order is whatever the
        underlying set yields and should not be relied upon.
    """
    # Each document is turned into a set first, then all sets are merged.
    return list(set().union(*map(set, dataset)))
 
 
def vocab_list2vec(vocab_list, input_set):
    """Count how often each vocabulary entry occurs in ``input_set``.

    Args:
        vocab_list: sequence of vocabulary entries; position defines the
            slot of each entry in the result.
        input_set: iterable of tokens to count.

    Returns:
        list: one integer per vocabulary entry; tokens that are not in the
        vocabulary are ignored.
    """
    counts = [0] * len(vocab_list)
    for token in input_set:
        try:
            slot = vocab_list.index(token)
        except ValueError:
            # Token is not part of the vocabulary: skip it.
            continue
        counts[slot] += 1
    return counts
 
 
def train_classify(train_mat, train_category):
    """Estimate per-class log word likelihoods and the class-1 prior.

    Args:
        train_mat: sequence of equal-length count vectors, one per document.
        train_category: sequence of labels parallel to ``train_mat``; a
            document counts toward class 1 when its label equals 1 and
            toward class 0 otherwise.

    Returns:
        tuple: ``(p0_vect, p1_vect, p_abusive)`` - the log of the smoothed
        per-word frequencies for class 0 and class 1, and the fraction
        ``sum(train_category) / len(train_mat)``.
    """
    doc_total = len(train_mat)
    vocab_size = len(train_mat[0])
    p_abusive = sum(train_category) / float(doc_total)

    # Slot 0 holds class-0 statistics, slot 1 holds class-1 statistics.
    # Word counts start at 1 and totals at 2.0 so no word gets probability 0.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for idx, row in enumerate(train_mat):
        slot = 1 if train_category[idx] == 1 else 0
        word_counts[slot] += row
        word_totals[slot] += sum(row)

    p1_vect = log(word_counts[1] / word_totals[1])
    p0_vect = log(word_counts[0] / word_totals[0])
    return p0_vect, p1_vect, p_abusive
 
 
def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Pick the class with the larger log-posterior score.

    Args:
        vec2classify: count vector of the document to classify.
        p0_vect: per-word log likelihoods for class 0.
        p1_vect: per-word log likelihoods for class 1.
        p_class: prior probability of class 1.

    Returns:
        int: 1 when the class-1 score is strictly greater, otherwise 0
        (ties go to class 0).
    """
    score_one = sum(vec2classify * p1_vect) + log(p_class)
    score_zero = sum(vec2classify * p0_vect) + log(1 - p_class)
    return 1 if score_one > score_zero else 0
 
 
def testing():
    """Train on the built-in toy corpus and print two sample predictions.

    Prints the learned log-likelihood vectors and the class-1 prior, then
    the predicted class for two hard-coded test posts.
    """
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)

    train_mat = []
    for post in words:
        train_mat.append(vocab_list2vec(vocab_list, post))

    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")

    # Classify each sample post against the model trained above.
    for test_entry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        doc = array(vocab_list2vec(vocab_list, test_entry))
        print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")
 
 
def words_bag2vec(vocab_list, input_set):
    """Build a bag-of-words count vector over ``vocab_list``.

    Args:
        vocab_list: sequence of vocabulary entries; position defines the
            slot of each entry in the result.
        input_set: iterable of tokens to count.

    Returns:
        list: occurrence count for each vocabulary entry; tokens outside
        the vocabulary contribute nothing.
    """
    bag = [0 for _ in vocab_list]
    # Only tokens that are part of the vocabulary are counted.
    recognised = (token for token in input_set if token in vocab_list)
    for token in recognised:
        bag[vocab_list.index(token)] += 1
    return bag
 
 
def text_parse(string):
    """Split text into lower-cased tokens longer than two characters.

    Args:
        string: the raw text to tokenize.

    Returns:
        list: lower-cased pieces obtained by splitting on runs of non-word
        characters, keeping only pieces with more than two characters.
    """
    import re

    tokens = []
    for piece in re.split(r'\W+', string):
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens
 
 
def spam_test():
    """Train on the e-mail corpus and print the hold-out error rate.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), holds out 10
    randomly chosen e-mails for testing, trains on the rest, and prints the
    fraction of held-out e-mails that were misclassified.

    Raises:
        OSError: if any of the expected e-mail files cannot be opened.
    """
    doc_list = []
    class_list = []
    for number in range(1, 26):
        for folder, label in (("spam", 1), ("ham", 0)):
            # The files are only read, so plain "r" is enough ("r+" would
            # demand write permission). Undecodable bytes are dropped rather
            # than aborting the whole run.
            with open(f"email/{folder}/{number}.txt", "r",
                      encoding="utf-8", errors="ignore") as f:
                text = f.read()
            # append (not extend): each entry must be ONE e-mail's token
            # list so that doc_list[i] stays aligned with class_list[i].
            doc_list.append(text_parse(text))
            class_list.append(label)

    vocab_list = create_vocab_list(doc_list)

    # Hold out 10 distinct random documents; everything else is training data.
    doc_indices = range(len(doc_list))
    test_list = random.sample(doc_indices, 10)
    held_out = set(test_list)
    train_list = [i for i in doc_indices if i not in held_out]

    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_spam = train_classify(array(train_mat), array(train_category))

    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # An error is a prediction that DIFFERS from the true label.
        if classify(array(word_vec), p0_v, p1_v, p_spam) != class_list[i]:
            error_count += 1
    print(f"error rate: {float(error_count) / len(test_list)}")
 
 
# Script entry point: run the toy-corpus demo first, then the e-mail spam test.
if __name__ == "__main__":
    testing()
    spam_test()

其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址:

1
https://gitee.com/navysummer/machine-learning/tree/master/bayes

  

 

posted @   NAVYSUMMER  阅读(9)  评论(0)  编辑  收藏  举报
交流群 编程书籍
点击右上角即可分享
微信分享提示