机器学习之朴素贝叶斯算法

import random

from numpy import ones, log, array


def load_dataset():
    """Return the built-in toy corpus of message-board posts with labels.

    Returns:
        A ``(words, class_list)`` tuple. ``words`` is a list of six
        tokenised posts (each a list of strings) and ``class_list`` holds
        the label at the same position: 1 for the posts treated as abusive
        by the classifier, 0 for the others.
    """
    labelled_posts = (
        ("my dog has flea problems help please", 0),
        ("maybe not take him to dog park stupid", 1),
        ("my dalmation is so cute I love him", 0),
        ("stop posting stupid worthless garbage", 1),
        ("mr licks ate my steak how to stop him", 0),
        ("quit buying worthless dog food stupid", 1),
    )
    # Split each sentence on whitespace to obtain the token lists.
    words = [sentence.split() for sentence, _ in labelled_posts]
    class_list = [label for _, label in labelled_posts]
    return words, class_list


def create_vocab_list(dataset):
    """Build the vocabulary: every distinct token found in ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token once. The order comes from
        set iteration and is therefore not guaranteed.
    """
    per_document_tokens = (set(document) for document in dataset)
    return list(set().union(*per_document_tokens))


def vocab_list2vec(vocab_list, input_set):
    """Turn a tokenised document into a count vector over ``vocab_list``.

    Args:
        vocab_list: List of vocabulary words; entry ``i`` of the result
            counts occurrences of ``vocab_list[i]``.
        input_set: Iterable of (hashable) tokens. Repeated tokens are
            counted each time they occur.

    Returns:
        A list of ints with ``len(vocab_list)`` entries. Tokens that are not
        in the vocabulary are ignored.
    """
    # Build the word -> index table once so each token costs an O(1) lookup
    # instead of two linear scans of the vocabulary. ``setdefault`` keeps the
    # first position of a repeated word, matching ``list.index``.
    first_index = {}
    for position, vocab_word in enumerate(vocab_list):
        first_index.setdefault(vocab_word, position)

    vec = [0] * len(vocab_list)
    for word in input_set:
        position = first_index.get(word)
        if position is not None:
            vec[position] += 1
    return vec


def train_classify(train_mat, train_category):
    """Estimate the naive Bayes parameters from labelled count vectors.

    Args:
        train_mat: Sequence of per-document word-count vectors, all of the
            same length.
        train_category: Sequence of labels, one per row of ``train_mat``.
            A label equal to 1 selects class 1; anything else is class 0.

    Returns:
        ``(p0_vect, p1_vect, p_abusive)``: the element-wise log of the
        smoothed per-word frequencies for class 0 and class 1, and the
        fraction ``sum(train_category) / len(train_mat)``.
    """
    doc_total = len(train_mat)
    vocab_size = len(train_mat[0])
    p_abusive = sum(train_category) / float(doc_total)

    # Index 0 / 1 of each list is the class. Word counts start at one and the
    # per-class totals at two so that no word gets zero probability.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    token_totals = [2.0, 2.0]
    for row_index, row in enumerate(train_mat):
        cls = 1 if train_category[row_index] == 1 else 0
        word_counts[cls] += row
        token_totals[cls] += sum(row)

    # Logs keep later products of many small probabilities from underflowing.
    p1_vect = log(word_counts[1] / token_totals[1])
    p0_vect = log(word_counts[0] / token_totals[0])
    return p0_vect, p1_vect, p_abusive


def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Pick the more likely class for a word-count vector.

    Args:
        vec2classify: Word-count vector of the document (numpy array).
        p0_vect: Log word probabilities for class 0.
        p1_vect: Log word probabilities for class 1.
        p_class: Prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """

    def log_score(log_word_probs, prior):
        # Sum of count-weighted log likelihoods plus the log prior.
        return sum(vec2classify * log_word_probs) + log(prior)

    class1_score = log_score(p1_vect, p_class)
    class0_score = log_score(p0_vect, 1 - p_class)
    return 1 if class1_score > class0_score else 0


def testing():
    """Train on the toy corpus and print predictions for two sample posts.

    Prints the learned log-probability vectors and the class-1 prior, then
    the predicted class for each hard-coded test entry. Returns nothing.
    """
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)

    # One count vector per training post.
    train_mat = []
    for post in words:
        train_mat.append(vocab_list2vec(vocab_list, post))

    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")

    sample_entries = (
        ["love", "my", "dalmation"],
        ["stupid", "garbage"],
    )
    for test_entry in sample_entries:
        doc = array(vocab_list2vec(vocab_list, test_entry))
        print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")


def words_bag2vec(vocab_list, input_set):
    """Build a bag-of-words count vector for a document.

    Args:
        vocab_list: List of vocabulary words; the result has one slot per
            entry, in the same order.
        input_set: Iterable of (hashable) tokens; every occurrence of a
            vocabulary word adds one to that word's slot.

    Returns:
        A list of ints of length ``len(vocab_list)``. Out-of-vocabulary
        tokens contribute nothing.
    """
    # Pre-compute each word's slot so counting is one dict lookup per token
    # rather than a membership scan plus an ``index`` scan of the list.
    # Iterating in reverse lets the earliest position of a duplicated word
    # win, which is what ``list.index`` would return.
    slot_of = {}
    for slot in range(len(vocab_list) - 1, -1, -1):
        slot_of[vocab_list[slot]] = slot

    vec = [0] * len(vocab_list)
    for word in input_set:
        if word in slot_of:
            vec[slot_of[word]] += 1
    return vec


def text_parse(string):
    """Tokenise text into lower-case words of at least three characters.

    Args:
        string: The raw text to tokenise.

    Returns:
        A list of the maximal runs of word characters (letters, digits,
        underscore) that are longer than two characters, lower-cased and in
        order of appearance.
    """
    import re

    # A greedy match of three or more word characters yields exactly the
    # pieces between non-word separators whose length exceeds two.
    tokens = []
    for token in re.findall(r'\w{3,}', string):
        tokens.append(token.lower())
    return tokens


def spam_test():
    """Run a hold-out evaluation of the classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and the
    matching ``email/ham/N.txt`` files (label 0), holds out ten randomly
    chosen messages for testing, trains on the rest and prints the fraction
    of held-out messages that were misclassified. Returns nothing.

    Raises:
        OSError: if one of the e-mail files cannot be opened.
    """
    doc_list = []
    class_list = []
    for number in range(1, 26):
        for folder, label in (("spam", 1), ("ham", 0)):
            # The files are only read, so plain read mode is enough.
            with open(f"email/{folder}/{number}.txt", "r") as f:
                # Keep each message as its own token list; flattening the
                # tokens would make doc_list[i] a single word, not a message.
                doc_list.append(text_parse(f.read()))
            class_list.append(label)

    vocab_list = create_vocab_list(doc_list)

    # Draw the held-out indices without replacement. random.sample cannot
    # produce an out-of-range index, unlike truncating random.uniform, whose
    # upper end-point may be returned.
    all_indices = range(len(doc_list))
    test_list = random.sample(all_indices, 10)
    held_out = set(test_list)
    train_list = [i for i in all_indices if i not in held_out]

    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_abusive = train_classify(array(train_mat), array(train_category))

    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # An error is a prediction that differs from the true label.
        if classify(array(word_vec), p0_v, p1_v, p_abusive) != class_list[i]:
            error_count += 1
    print(f"error rate: {float(error_count) / len(test_list)}")


if __name__ == "__main__":
    # When run as a script: first the toy-corpus demo, then the evaluation
    # that reads the e-mail files under ``email/``.
    testing()
    spam_test()

其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址:

https://gitee.com/navysummer/machine-learning/tree/master/bayes

  

 

posted @ 2024-06-29 19:12  NAVYSUMMER  阅读(5)  评论(0)  编辑  收藏  举报
交流群 编程书籍