机器学习之朴素贝叶斯算法
"""Naive Bayes text classifiers: an abusive-post demo and a spam filter.

Based on the classic "Machine Learning in Action" chapter 4 example.
"""

import random
import re

from numpy import array, log, ones


def load_dataset():
    """Return a toy corpus of 6 tokenized posts and their labels.

    Returns:
        (words, class_list): ``words`` is a list of token lists,
        ``class_list`` holds the label per post (1 = abusive, 0 = not).
    """
    words = [
        ["my", "dog", "has", "flea", "problems", "help", "please"],
        ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"],
        ["my", "dalmation", "is", "so", "cute", "I", "love", "him"],
        ["stop", "posting", "stupid", "worthless", "garbage"],
        ["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"],
        ["quit", "buying", "worthless", "dog", "food", "stupid"],
    ]
    class_list = [0, 1, 0, 1, 0, 1]
    return words, class_list


def create_vocab_list(dataset):
    """Return the sorted-by-nothing list of unique tokens across all documents.

    Args:
        dataset: iterable of token lists (one per document).

    Returns:
        list of unique tokens (order follows set iteration order).
    """
    vocab_set = set()
    for doc in dataset:
        vocab_set |= set(doc)
    return list(vocab_set)


def vocab_list2vec(vocab_list, input_set):
    """Convert a document into a bag-of-words count vector over ``vocab_list``.

    Tokens absent from the vocabulary are silently ignored.

    Args:
        vocab_list: list of vocabulary tokens.
        input_set: iterable of tokens in the document.

    Returns:
        list[int] of per-token occurrence counts, aligned with ``vocab_list``.
    """
    vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            vec[vocab_list.index(word)] += 1
    return vec


def words_bag2vec(vocab_list, input_set):
    """Bag-of-words vectorizer; identical to :func:`vocab_list2vec`.

    Kept as a separate public name for backward compatibility; it simply
    delegates to ``vocab_list2vec`` (both use the bag-of-words ``+= 1`` model).
    """
    return vocab_list2vec(vocab_list, input_set)


def train_classify(train_mat, train_category):
    """Train a two-class naive Bayes model with Laplace smoothing.

    Args:
        train_mat: sequence of bag-of-words count vectors (one per document).
        train_category: sequence of 0/1 labels aligned with ``train_mat``.

    Returns:
        (p0_vect, p1_vect, p_abusive): log word-likelihood vectors for class 0
        and class 1, and the prior probability of class 1.
    """
    num_train_doc = len(train_mat)
    num_words = len(train_mat[0])
    # Prior: fraction of documents labelled 1.
    p_abusive = sum(train_category) / float(num_train_doc)
    # Laplace smoothing: start counts at 1 and denominators at 2 so that an
    # unseen word never drives the whole product (log-sum) to -inf.
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_train_doc):
        if train_category[i] == 1:
            p1_num += train_mat[i]
            p1_denom += sum(train_mat[i])
        else:
            p0_num += train_mat[i]
            p0_denom += sum(train_mat[i])
    # Log-probabilities so classify() can sum instead of multiplying tiny
    # numbers (avoids floating-point underflow).
    p1_vect = log(p1_num / p1_denom)
    p0_vect = log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive


def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Classify a count vector using trained log-likelihoods.

    Args:
        vec2classify: numpy array of word counts for the document.
        p0_vect, p1_vect: log word-likelihood vectors from train_classify.
        p_class: prior probability of class 1.

    Returns:
        1 if the class-1 log-posterior is larger, else 0.
    """
    p1 = sum(vec2classify * p1_vect) + log(p_class)
    p0 = sum(vec2classify * p0_vect) + log(1 - p_class)
    return 1 if p1 > p0 else 0


def testing():
    """Train on the toy corpus and classify two sample posts (prints results)."""
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)
    train_mat = [vocab_list2vec(vocab_list, doc) for doc in words]
    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")
    test_entry = ["love", "my", "dalmation"]
    doc = array(vocab_list2vec(vocab_list, test_entry))
    print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")
    test_entry = ["stupid", "garbage"]
    doc = array(vocab_list2vec(vocab_list, test_entry))
    print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")


def text_parse(string):
    """Tokenize raw text: split on non-word runs, lowercase, drop tokens <= 2 chars."""
    word_list = re.split(r'\W+', string)
    return [token.lower() for token in word_list if len(token) > 2]


def spam_test():
    """Train/test a spam filter on email/spam/*.txt and email/ham/*.txt.

    Holds out 10 random documents of the 50 as a test set and prints the
    error rate.  Assumes the email files decode with the platform default
    encoding — NOTE(review): confirm; the original corpus contains a
    non-UTF-8 file in some distributions.
    """
    doc_list = []
    class_list = []
    full_text_list = []
    for i in range(1, 26):
        # Files are only read, so open read-only (was "r+").
        with open(f"email/spam/{i}.txt", "r") as f:
            word_list = text_parse(f.read())
        # Bug fix: was extend(), which flattened every document into one
        # long word list so doc_list[i] was a single *string* and the
        # vocabulary was built from characters.  Each document must stay
        # a separate token list.
        doc_list.append(word_list)
        full_text_list.extend(word_list)
        class_list.append(1)
        with open(f"email/ham/{i}.txt", "r") as f:
            word_list = text_parse(f.read())
        doc_list.append(word_list)
        full_text_list.extend(word_list)
        class_list.append(0)
    vocab_list = create_vocab_list(doc_list)
    # Randomly hold out 10 of the 50 documents for testing.
    train_list = list(range(50))
    test_list = []
    for _ in range(10):
        index = random.randrange(len(train_list))
        test_list.append(train_list[index])
        del train_list[index]
    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_abusive = train_classify(array(train_mat), array(train_category))
    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # Bug fix: was `== class_list[i]`, which counted *correct*
        # predictions as errors; an error is a mismatch.
        if classify(array(word_vec), p0_v, p1_v, p_abusive) != class_list[i]:
            error_count += 1
    print(f"error rate: {float(error_count) / len(test_list)}")


if __name__ == "__main__":
    testing()
    spam_test()
其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址:
https://gitee.com/navysummer/machine-learning/tree/master/bayes