机器学习之朴素贝叶斯算法
"""Naive Bayes text classifier: a toy abusive-post demo and a spam/ham e-mail test."""
import random
import re

from numpy import array, log, ones


def load_dataset():
    """Return the toy training corpus and its labels.

    Returns:
        A ``(words, class_list)`` tuple. ``words`` is a list of tokenized
        posts; ``class_list[i]`` is 1 when post ``i`` is abusive, else 0.
    """
    words = [
        ["my", "dog", "has", "flea", "problems", "help", "please"],
        ["maybe", "not", "take", "him", "to", "dog", "park", "stupid"],
        ["my", "dalmation", "is", "so", "cute", "I", "love", "him"],
        ["stop", "posting", "stupid", "worthless", "garbage"],
        ["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"],
        ["quit", "buying", "worthless", "dog", "food", "stupid"],
    ]
    class_list = [0, 1, 0, 1, 0, 1]
    return words, class_list


def create_vocab_list(dataset):
    """Return the list of distinct tokens appearing in any document.

    Args:
        dataset: Iterable of documents, each an iterable of tokens.

    Returns:
        A duplicate-free list of tokens. It is sorted so that the column
        order of the feature vectors is reproducible between runs (plain
        set iteration order varies with string hash randomization).
    """
    vocab_set = set()
    for data in dataset:
        vocab_set |= set(data)
    return sorted(vocab_set)


def vocab_list2vec(vocab_list, input_set):
    """Convert a tokenized document into a term-count vector.

    Args:
        vocab_list: Vocabulary; position ``i`` of the result counts
            ``vocab_list[i]``.
        input_set: Iterable of tokens. Tokens absent from the vocabulary
            are ignored.

    Returns:
        A list of ints with the same length as ``vocab_list``.
    """
    # One dict lookup per token instead of a linear ``list.index`` scan.
    # ``setdefault`` keeps the first position of a repeated vocabulary entry,
    # which is what ``list.index`` would have returned.
    position = {}
    for i, word in enumerate(vocab_list):
        position.setdefault(word, i)

    vec = [0] * len(vocab_list)
    for word in input_set:
        i = position.get(word)
        if i is not None:
            vec[i] += 1
    return vec


def train_classify(train_mat, train_category):
    """Estimate the naive Bayes parameters from count vectors.

    Args:
        train_mat: Non-empty sequence of equal-length term-count vectors.
        train_category: Sequence of 0/1 labels aligned with ``train_mat``.

    Returns:
        ``(p0_vect, p1_vect, p_abusive)``: the per-token log-likelihood
        arrays for class 0 and class 1, and the fraction of training
        documents labelled 1.
    """
    num_train_doc = len(train_mat)
    num_words = len(train_mat[0])
    p_abusive = sum(train_category) / float(num_train_doc)

    # Counts start at 1 and totals at 2.0 so that a token never seen in one
    # class does not produce log(0).
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0 = 2.0
    p1 = 2.0
    for row, label in zip(train_mat, train_category):
        if label == 1:
            p1_num += row
            p1 += sum(row)
        else:
            p0_num += row
            p0 += sum(row)

    # Log space: classify() sums these instead of multiplying tiny numbers.
    p1_vect = log(p1_num / p1)
    p0_vect = log(p0_num / p0)
    return p0_vect, p1_vect, p_abusive


def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Return the more probable class (0 or 1) for a count vector.

    Args:
        vec2classify: Term-count array for the document.
        p0_vect: Per-token log-likelihoods for class 0.
        p1_vect: Per-token log-likelihoods for class 1.
        p_class: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater, otherwise 0.
    """
    p1 = sum(vec2classify * p1_vect) + log(p_class)
    p0 = sum(vec2classify * p0_vect) + log(1 - p_class)
    return 1 if p1 > p0 else 0


def testing():
    """Train on the toy corpus and print predictions for two sample posts."""
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)
    train_mat = [vocab_list2vec(vocab_list, _) for _ in words]
    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")

    for test_entry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        doc = array(vocab_list2vec(vocab_list, test_entry))
        print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")


def words_bag2vec(vocab_list, input_set):
    """Bag-of-words vector; same contract and result as ``vocab_list2vec``."""
    return vocab_list2vec(vocab_list, input_set)


def text_parse(string):
    """Split text on non-word characters into lower-cased tokens.

    Tokens of one or two characters are dropped.
    """
    word_list = re.split(r"\W+", string)
    return [_.lower() for _ in word_list if len(_) > 2]


def spam_test(email_dir="email", num_per_class=25, num_test=10):
    """Run one random hold-out evaluation on the spam/ham e-mail corpus.

    Reads ``{email_dir}/spam/{i}.txt`` and ``{email_dir}/ham/{i}.txt`` for
    ``i`` in ``1..num_per_class``, holds out ``num_test`` random documents,
    trains on the rest and prints the hold-out error rate.

    Args:
        email_dir: Directory containing the ``spam`` and ``ham`` folders.
        num_per_class: Number of numbered files to read from each folder.
        num_test: Number of documents held out for testing.

    Returns:
        The fraction of held-out documents that were misclassified.

    Raises:
        ValueError: If ``num_test`` is not between 1 and the corpus size.
        OSError: If an expected e-mail file cannot be opened.
    """
    doc_list = []
    class_list = []
    for i in range(1, num_per_class + 1):
        for folder, label in (("spam", 1), ("ham", 0)):
            # Read-only; undecodable bytes are skipped rather than aborting
            # the whole run on one badly encoded message.
            with open(f"{email_dir}/{folder}/{i}.txt", "r",
                      encoding="utf-8", errors="ignore") as f:
                # append, not extend: each entry must be one whole document
                # so that doc_list[k] lines up with class_list[k].
                doc_list.append(text_parse(f.read()))
            class_list.append(label)

    num_docs = len(doc_list)
    if not 0 < num_test <= num_docs:
        raise ValueError(
            f"num_test must be between 1 and {num_docs}, got {num_test}"
        )

    vocab_list = create_vocab_list(doc_list)

    # Random hold-out split without replacement.
    test_list = random.sample(range(num_docs), num_test)
    held_out = set(test_list)
    train_list = [i for i in range(num_docs) if i not in held_out]

    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_abusive = train_classify(array(train_mat), array(train_category))

    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # An error is a prediction that differs from the true label.
        if classify(array(word_vec), p0_v, p1_v, p_abusive) != class_list[i]:
            error_count += 1

    error_rate = float(error_count) / len(test_list)
    print(f"error rate: {error_rate}")
    return error_rate


if __name__ == "__main__":
    testing()
    spam_test()
其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址:
https://gitee.com/navysummer/machine-learning/tree/master/bayes
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步