Naive Bayes: Algorithm Implementation
Naive Bayes is typically used when the features are discrete, i.e. each feature takes values from a finite set. Given a training set and its labels, we estimate the probability of each class label in the training set, and then the probability of each feature value under each class label. For a sample to be predicted, we compute the score under each class label from the sample's features and pick the class with the largest value as the prediction.
Bayesian estimation is usually applied to keep any probability from being 0: for each feature's conditional probability, add a term λ to the numerator and S_j·λ to the denominator, where S_j is the number of values feature j can take. Taking logs then avoids products of probabilities becoming so small that they are indistinguishable.
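Concretely, with $N$ training samples, $K$ classes $c_k$, and $S_j$ possible values $a_{jl}$ for feature $j$ (this formula block is my addition, spelling out the standard textbook notation the paragraph above alludes to):

$$P_\lambda(Y=c_k)=\frac{\sum_{i=1}^{N} I(y_i=c_k)+\lambda}{N+K\lambda},\qquad P_\lambda\bigl(X^{(j)}=a_{jl}\mid Y=c_k\bigr)=\frac{\sum_{i=1}^{N} I\bigl(x_i^{(j)}=a_{jl},\,y_i=c_k\bigr)+\lambda}{\sum_{i=1}^{N} I(y_i=c_k)+S_j\lambda}$$

Classification is then done in log space:

$$\hat{y}=\arg\max_{c_k}\Bigl[\log P_\lambda(Y=c_k)+\sum_{j}\log P_\lambda\bigl(X^{(j)}=x^{(j)}\mid Y=c_k\bigr)\Bigr]$$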
Two worked examples follow: filtering abusive posts on a message board, and filtering spam email.
- Build a vocabulary list from the training documents.
- Convert each training document into a row vector over that vocabulary (1 if the word is present, 0 if not; or accumulate counts for a bag-of-words model) — see the small example after this list.
- From these row vectors, compute the quantities used for classification: the class priors, and the probability of each feature within each class (use logs, or initialize counts to nonzero values, to guarantee no probability collapses to 0 at classification time).
- To classify a new input, compute the posterior under each class and pick the largest as the predicted class.
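For instance (a hypothetical four-word vocabulary for illustration; the real one is built from a set, so its order is unspecified), the post ['my', 'dog', 'has', 'fleas'] vectorizes as:

vocabList = ['dog', 'stupid', 'my', 'garbage']  # hypothetical order
# set-of-words vector for ['my', 'dog', 'has', 'fleas'] -> [1, 0, 1, 0]
# out-of-vocabulary words ('has', 'fleas') are simply ignored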
import numpy as np
"""过滤恶意留言"""
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
def createVocabList(dataSet):  # dataSet: m documents, each a list of tokens
    vocabSet = set()  # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # record presence/absence only
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # count repeated occurrences of the same word
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = np.sum(trainCategory) / float(numTrainDocs)  # prior P(class = 1)
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)  # init counts to ones (smoothing)
    p0Denom = 2.0; p1Denom = 2.0  # init denominators to 2.0 for the same reason
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    p1Vect = np.log(p1Num / p1Denom)  # log() to avoid underflow when summing later
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # log posterior (up to a constant): sum of log-likelihoods plus log prior
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)  # element-wise mult
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
if __name__ == '__main__':
    postingList, classVec = loadDataSet()
    vocabSet = createVocabList(postingList)
    trainMatrix = []
    for postinDoc in postingList:
        trainMatrix.append(setOfWords2Vec(vocabSet, postinDoc))
    p0, p1, pAb = trainNB0(trainMatrix, classVec)
    t1 = ['love', 'my', 'dalmation']
    t2 = ['stupid', 'garbage']
    t = [t1, t2]
    for ti in t:  # classify the two test posts (the original looped over postingList by mistake)
        ti_vec = setOfWords2Vec(vocabSet, ti)
        result = classifyNB(ti_vec, p0, p1, pAb)
        print(result)  # expected: 0 for t1, 1 for t2
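bagOfWords2VecMN is defined above but never exercised. A minimal sketch (my illustration, not part of the original demo) of swapping the bag-of-words model into the same training flow:

# same pipeline, counting word occurrences instead of presence/absence
trainMatrixMN = [bagOfWords2VecMN(vocabSet, doc) for doc in postingList]
p0, p1, pAb = trainNB0(trainMatrixMN, classVec)
print(classifyNB(np.array(bagOfWords2VecMN(vocabSet, ['stupid', 'garbage'])), p0, p1, pAb))  # expect 1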
import numpy as np
import re
import os
from ex04_1_bayes import *  # createVocabList, setOfWords2Vec, trainNB0, classifyNB from the script above
"""Filter spam email"""
def textParse(bigString):  # input is a big string, output is a word list
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
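# Illustration (my addition): textParse splits on non-word characters, keeps only
# tokens longer than two characters, and lower-cases them, e.g.
#   textParse("Hi there, ML is my favourite Subject!!")
#   -> ['there', 'favourite', 'subject']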
def spamTest():
    path_ham = r'H:\ML\MachineLearninginAction\04bayes\email\ham'
    path_spam = r'H:\ML\MachineLearninginAction\04bayes\email\spam'
    ham_files = os.listdir(path_ham)
    spam_files = os.listdir(path_spam)
    num_of_ham = len(ham_files)
    num_of_spam = len(spam_files)  # read the file lists
    data_set = []
    labels_set = [0] * num_of_ham
    labels_set.extend([1] * num_of_spam)  # set the labels: 0 = ham, 1 = spam
    for i in range(num_of_ham):
        with open(os.path.join(path_ham, ham_files[i])) as f:
            data_set.append(textParse(f.read()))
    for i in range(num_of_spam):
        with open(os.path.join(path_spam, spam_files[i])) as f:
            data_set.append(textParse(f.read()))  # parse each email into a word list
    vocab_set = createVocabList(data_set)  # build the vocabulary
    train_mat = []
    for data in data_set:
        train_mat.append(setOfWords2Vec(vocab_set, data))
    rand_index = np.r_[0:len(train_mat)]
    np.random.shuffle(rand_index)  # shuffle the indices
    train_mat = np.array(train_mat)
    labels_set = np.array(labels_set)
    train_mat_select = train_mat[rand_index[:40]]
    labels_set_select = labels_set[rand_index[:40]]  # 40 emails as the training set
    p0, p1, pAb = trainNB0(train_mat_select, labels_set_select)  # learned parameters
    test_set_select = train_mat[rand_index[40:]]
    test_labels_select = labels_set[rand_index[40:]]  # the remaining 10 as the test set
    test_results = []
    for test_m in test_set_select:
        result = classifyNB(test_m, p0, p1, pAb)  # renamed from 're', which shadowed the re module
        test_results.append(result)  # classify
    print(test_results)
    print(test_labels_select)
    compare_results = np.argwhere(np.array(test_results) == np.array(test_labels_select))
    acc = 1.0 * len(compare_results) / len(test_labels_select)  # fraction of correct predictions
    print(acc)
    return acc
if __name__ == '__main__':
    spamTest()
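Because the 40/10 split is random, a single run's accuracy is noisy; averaging over repeated random splits gives a steadier estimate. A minimal sketch, relying on spamTest returning acc as above:

accs = [spamTest() for _ in range(10)]  # repeat with fresh random splits
print('mean accuracy over 10 runs:', np.mean(accs))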
---
This article is from cnblogs, author: Bingmous. Please cite the original link when reposting: https://www.cnblogs.com/bingmous/p/15643740.html