贝叶斯-垃圾邮件
开始一点点写贝叶斯过滤垃圾邮件
之前写的,没注意把文件名对应上,可能会有些不清楚导包
代码都会上传到github
首先,写代码之前好好看一下《西瓜书》和《机器学习实战》里面关于贝叶斯理论的介绍:$p(c|x) = \frac{p(x|c)p(c)}{p(x)}$
在这里有:后验概率 = 先验概率 × 调整因子
分别解释一下:先验概率 $p(c)$ 是指 x 发生前对 c 的判断;
后验概率 $p(c|x)$ 是指 x 发生后对 c 的重新评估;
那么剩下的 $\frac{p(x|c)}{p(x)}$ 就是调整因子,它的作用是使预估概率更接近真实概率
有了以上这些,再结合《机器学习实战》的4.2节来看应该会好理解一点
首先进行文本分类,这里我创建一个wordVec.py文件,代码见下
# 这个程序的目的是创建一个n*m列表,然后得到set集合,再用给定的序列对其判定,输出一个序列
# 本来这个序列是未排序的,所以多次输出结果会不同,但是如果排序之后再输出
# 那么当我们知道某个位置是什么的情况下,就可对这个序列进行判定
def loadDataSet():
    """Return the toy message-board corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenised posts (each a list of words) and ``classVec`` is the list of
        matching labels in the same order: 1 for an abusive post, 0 for a
        normal one.
    """
    # Each entry pairs a label with the words of one post, so the two
    # returned lists cannot drift out of step.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
# 用一个set把数据集的内容全部打包
def createVocabList(dataSet):
    """Build the vocabulary of every distinct token found in ``dataSet``.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list of the unique tokens in ascending order. Sorting gives every
        word a fixed position, so vectors built against this vocabulary are
        reproducible from run to run.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union: avoids allocating a brand-new set per document.
        vocabSet.update(document)
    return sorted(vocabSet)
# 对某一指定序列进行向量化输出
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: Sequence of vocabulary words; position ``i`` of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of the words of one document (assumed hashable,
            e.g. strings).

    Returns:
        A list of ``len(vocabList)`` ints: 1 where the vocabulary word occurs
        in ``inputSet``, 0 elsewhere. Words that are not in the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Map each word to its position once, instead of scanning the vocabulary
    # twice (``in`` + ``index``) for every input word. ``setdefault`` keeps the
    # first occurrence, which is what ``list.index`` would have returned.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word:%s is not in my vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec
if __name__ == "__main__":
    # Demo: show the vocabulary, its size, and the presence vector of the
    # first post. The labels are not needed here.
    posts, _labels = loadDataSet()
    vocabulary = createVocabList(posts)
    print(vocabulary)
    print(len(vocabulary))
    firstPostVector = setOfWords2Vec(vocabulary, posts[0])
    print(firstPostVector)
这里比较容易理解:创建了一个6行的数据集,然后createVocabList函数先用set去重、再排序,得到一个词汇列表(一开始也不太熟set,这里我多练习了几次),然后用setOfWords2Vec函数,将某一行数据转换成只有0和1的向量,这里我们要对输入和输出注意一下
这时候来到了4.5.2,我们要从词向量计算概率
从整体来看,也就是按类别对每个词出现的次数进行统计、汇总,再除以该类别的总词数。具体来看:侮辱性的是3行,这3行总共有19个词;正常的也是3行,这3行总共有24个词(合计43个词,但是set会去掉重复的,所以set之后是32个词)。然后stupid这个词在侮辱类中出现了3次,3/19≈0.15789474,也就是书上说的分类为侮辱的依据
那么我们可以在写代码的过程中多次调用print函数来查看中间结果
这里我创建了classifier_bayes.py文件,代码如下
from math import log
import numpy as np
import wordVec
# 传入的trainMatrix是6*32的矩阵:共6行,每行32个元素,词出现过的位置为1,未出现则为0
# trainCategory是[0,1,0,1,0,1]
def trainNBO(trainMatrix, trainCtegory):
    """Estimate unsmoothed per-class word probabilities, tracing each step.

    This is the teaching version: it prints the running numerators and
    denominators after every document so the accumulation can be followed.

    Args:
        trainMatrix: Sequence of equal-length 0/1 word vectors, one per
            training document.
        trainCtegory: Sequence of labels aligned with ``trainMatrix``; a label
            equal to 1 selects class 1, anything else selects class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect`` are NumPy
        arrays holding, for each vocabulary position, the class's count for
        that word divided by the class's total word count, and ``pAbusive`` is
        the fraction of documents labelled 1.

    Note:
        No smoothing is applied, so a word never seen in a class gets
        probability 0, and a class with no words at all divides by zero.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior of class 1: share of the training documents labelled 1.
    pAbusive = sum(trainCtegory) / numTrainDocs
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    p0Denom = 0
    p1Denom = 0
    for i in range(numTrainDocs):
        if trainCtegory[i] == 1:
            p1Num += trainMatrix[i]
            # Label fixed: this trace shows the class-1 numerator.
            print("p1Num:", p1Num)
            p1Denom += sum(trainMatrix[i])
            print("p1Denom:", p1Denom)
        else:
            p0Num += trainMatrix[i]
            print("p0Num:", p0Num)
            p0Denom += sum(trainMatrix[i])
            print("p0Denom:", p0Denom)
    p1Vect = p1Num / p1Denom
    p0Vect = p0Num / p0Denom
    return p0Vect, p1Vect, pAbusive
if __name__ == "__main__":
    # The six toy posts and their 0/1 labels.
    posts, labels = wordVec.loadDataSet()
    # Distinct words across all posts (32 for this data set).
    vocabulary = wordVec.createVocabList(posts)
    # One 0/1 presence vector per post, giving a 6 x 32 training matrix.
    trainMat = [wordVec.setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNBO(trainMat, labels)
    print("p0V", p0V)
    print("p1V", p1V)
    print("pAb", pAb)
现在我们已经对非侮辱(0)和侮辱(1)两类的条件概率p0V和p1V分别进行了计算
如果我们想进行类别判断,不必把$\frac{p(x|c)p(c)}{p(x)}$全部求出来,因为分母$p(x)$对所有类别都相同,我们只需要比较$p(x|c)p(c)$;并且我们假设各个词之间相互独立,那么$p(x|c)$就变成了$p(x_1|c)p(x_2|c)\cdots p(x_n|c)$
并且考虑到如果其中有某一项为0,那么整个乘积为0,所以把每个词的计数初始化为1、分母初始化为2;又因为很多很小的概率连乘会下溢,所以采用取对数的方法,把连乘变成求和来计算
据此进行修改,并创建bayes.py文件,代码见下
import numpy as np
import wordVec
# 需要的时候释放一些注释
def trainNBO(trainMatrix, trainCtegory):
    """Fit smoothed, log-scale naive-Bayes word probabilities.

    Args:
        trainMatrix: 2-D array-like of word vectors, one row per training
            document and one column per vocabulary word.
        trainCtegory: 1-D array-like of labels aligned with the rows; a label
            equal to 1 selects class 1, anything else selects class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``. ``p0Vect`` and ``p1Vect`` are NumPy
        arrays of log-probabilities per vocabulary word for class 0 and
        class 1; ``pAbusive`` is the fraction of documents labelled 1.

    Raises:
        IndexError: if ``trainMatrix`` is empty, or if the number of labels
            does not match the number of rows.
    """
    matrix = np.asarray(trainMatrix)
    labels = np.asarray(trainCtegory)
    numTrainDocs = matrix.shape[0]
    numWords = matrix.shape[1]
    pAbusive = sum(trainCtegory) / numTrainDocs

    # Select each class's rows with a mask and reduce them in NumPy rather
    # than looping over the documents in Python.
    isClass1 = labels == 1
    class1Rows = matrix[isClass1]
    class0Rows = matrix[~isClass1]

    # Counts start at 1 and denominators at 2 so that a word unseen in a
    # class does not zero out the whole product.
    p1Num = np.ones(numWords) + class1Rows.sum(axis=0)
    p0Num = np.ones(numWords) + class0Rows.sum(axis=0)
    p1Denom = 2 + class1Rows.sum()
    p0Denom = 2 + class0Rows.sum()

    # Logs turn the later product of many small probabilities into a sum.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one word vector.

    Args:
        vec2Classify: 1-D array-like word vector for the document.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    # Coerce once so plain lists work too, then let NumPy do the weighted sum
    # instead of iterating the element-wise product in Python.
    vec = np.asarray(vec2Classify)
    p1 = np.dot(vec, p1Vec) + np.log(pClass1)
    p0 = np.dot(vec, p0Vec) + np.log(1 - pClass1)
    return 1 if p1 > p0 else 0
def testingNB():
    """Train on the toy posts and print the predicted class of two samples.

    Prints the fitted class-0 log-probabilities, class-1 log-probabilities
    and class-1 prior, then one "classified as" line per sample entry.
    """
    posts, labels = wordVec.loadDataSet()
    vocabulary = wordVec.createVocabList(posts)
    trainMat = [wordVec.setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNBO(trainMat, labels)
    for fitted in (p0V, p1V, pAb):
        print(fitted)

    # The same encode / classify / report steps run for each sample.
    samples = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for testEntry in samples:
        thisDoc = np.array(wordVec.setOfWords2Vec(vocabulary, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        print("%s classified as %d" % (testEntry, predicted))


if __name__ == "__main__":
    testingNB()
最后,用上述写好的代码,来进行一个垃圾邮件的测试
代码见下
# 现在要对邮件进行检测
import random
import bayes
import wordVec
import numpy as np
# 使用正则表达式来获取其中内容
def textParse(listString):
    """Split raw text into lowercase tokens longer than two characters.

    Args:
        listString: The text to tokenise.

    Returns:
        A list of the pieces obtained by splitting on runs of non-word
        characters, lowercased, keeping only pieces with more than two
        characters.
    """
    import re

    tokens = []
    for piece in re.split(r'\W+', listString):
        # Skip short pieces, including the empty strings the split leaves
        # at the ends of the text.
        if len(piece) <= 2:
            continue
        tokens.append(piece.lower())
    return tokens
def _readEmail(path):
    """Return the text of one e-mail file, closing the file afterwards."""
    # One explicit codec for both folders keeps the run independent of the
    # platform default encoding; bytes the codec cannot map are dropped
    # instead of aborting the whole experiment.
    with open(path, encoding='windows-1252', errors='ignore') as handle:
        return handle.read()


def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), holds out 10 randomly chosen
    documents as the test set, trains on the rest, and prints the
    misclassified documents followed by the error count, the test-set size
    and the error rate.

    Returns:
        None. All results are written to stdout.
    """
    emailsPerClass = 25
    testSize = 10

    # Tokenised documents and their labels, spam and ham interleaved.
    docList = []
    classList = []
    for i in range(1, emailsPerClass + 1):
        docList.append(textParse(_readEmail('email/spam/%d.txt' % i)))
        classList.append(1)
        docList.append(textParse(_readEmail('email/ham/%d.txt' % i)))
        classList.append(0)

    vocabList = wordVec.createVocabList(docList)

    # Document indices; sized from the data rather than a literal 50.
    trainingSet = list(range(len(docList)))
    print("trainingSet:", trainingSet)

    # Move testSize random indices out of the training set.
    testSet = []
    for _ in range(testSize):
        # randrange never returns the upper bound, whereas
        # int(random.uniform(0, n)) can round up to n and index out of range.
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet.pop(randIndex))
        print("testSet", testSet)
    print(len(trainingSet))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(wordVec.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNBO(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = wordVec.setOfWords2Vec(vocabList, docList[docIndex])
        predicted = bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
            print(errorCount)
            print("分类错误的测试集:", docList[docIndex])
    print(classList)
    print(errorCount)
    print(len(testSet))
    print("the error rate is :", errorCount/len(testSet))
# Module-level call: runs the hold-out experiment when this script executes.
# NOTE(review): there is no `if __name__ == "__main__":` guard here, so
# importing this module would also trigger the run.
spamTest()
这里可能试几次错误率都是0,多run几次试试