Summary: This post uses the bag-of-words model: instead of only recording whether a word appears in a document (the set-of-words model), each entry of the document vector counts how many times the word appears.
1. Preparing the data: splitting text
In the previous section, on filtering abusive posts on a message board, the word vectors were given in advance. This section shows how to build your own word list from text documents.
Let's start with an example. At the Python prompt, enter:
>>> mySent = 'This book is the best book on python or M.L. I have ever laid eyes upon.'
>>> mySent.split()
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
# Punctuation is treated as part of the tokens, so use a regular expression instead:
# the delimiter is any run of characters other than letters and digits
>>> import re
>>> regEx = re.compile('\\W+')
>>> listOfTokens = regEx.split(mySent)
>>> listOfTokens
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']
# Drop the empty strings by keeping only tokens whose length is greater than 0
>>> [tok for tok in listOfTokens if len(tok) > 0]
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']
# Convert every token to lower case with .lower() (or to upper case with .upper())
>>> [tok.lower() for tok in listOfTokens if len(tok) > 0]
['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']
This example uses 50 emails in total, taken from the email folder's ham and spam subfolders; each subfolder contains 25 files named d.txt (d from 1 to 25). Ten of the emails are randomly selected to serve as the test set.
Create a file bayes.py and add the following code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from numpy import *

# Create a list of the unique words appearing across all documents
def createVocabList(dataSet):
    vocabSet = set([])                       # create an empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union with each document's word set
    return list(vocabSet)

# Input: the vocabulary list and a document; output: the document vector
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1  # bag-of-words model: count occurrences, unlike the set-of-words model
        else:
            print "the word: %s is not in my Vocabulary!" % word
    return returnVec
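As a quick sanity check, the two functions can be exercised on a tiny made-up corpus (a minimal sketch; toyDocs is invented toy data, not the email dataset):

# sanity check with invented toy data (requires bayes.py on the path)
from bayes import createVocabList, setOfWords2Vec

toyDocs = [['my', 'dog', 'has', 'flea', 'problems'],
           ['stop', 'posting', 'stupid', 'garbage', 'stop']]
vocab = createVocabList(toyDocs)           # 9 unique words, in arbitrary set order
vec = setOfWords2Vec(vocab, toyDocs[1])
print vec[vocab.index('stop')]             # prints 2: 'stop' occurs twice (bag-of-words)
print vec[vocab.index('dog')]              # prints 0: 'dog' is absent from document 1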
2. Training the algorithm: computing probabilities from word vectors
The previous section showed how to convert a set of words into a vector of numbers; now let's see how to use those numbers to compute probabilities. For every training document we know its word vector w and the class it belongs to. By Bayes' rule, p(ci|w) = p(w|ci)p(ci)/p(w), and the naive assumption that words are independent factors p(w|ci) into p(w0|ci)p(w1|ci)...p(wN|ci), so training reduces to counting: estimate the class prior p(ci) and the per-word conditionals p(wj|ci) from the training documents.
# Naive Bayes classifier training function
def trainNBO(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    # Initialize counts to 1 and denominators to 2, so that no p(wj|ci) is ever 0;
    # a single zero factor would wipe out the whole product
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs: when computing p(w0|1)p(w1|1)... most factors are tiny, and
    # multiplying many tiny numbers underflows and rounds to 0; the log turns
    # the product into a stable sum
    # p1Vect = p1Num/p1Denom      # plain version, prone to underflow downstream
    # p0Vect = p0Num/p0Denom
    p1Vect = log(p1Num/p1Denom)
    p0Vect = log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive
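To see the smoothing and the log transform at work, here is a minimal sketch on made-up count vectors (a 2-word vocabulary and 3 documents; none of this is the email data):

# toy training run over a 2-word vocabulary (invented data)
from numpy import array, exp
from bayes import trainNBO

trainMat = array([[2, 0],     # doc 0: word A twice            (class 1)
                  [0, 1],     # doc 1: word B once             (class 0)
                  [1, 1]])    # doc 2: words A and B once each (class 0)
trainClasses = array([1, 0, 0])
p0V, p1V, pA = trainNBO(trainMat, trainClasses)
print pA           # 0.333...: one of the three documents is class 1
print exp(p1V)     # [0.75 0.25] = [(1+2)/(2+2), (1+0)/(2+2)]; smoothing keeps both nonzero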
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # compare the two log posteriors: sum of log p(wj|ci) plus the log class prior
    p1 = sum(vec2Classify*p1Vec) + log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# File parsing and the complete spam-filter test function
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    # lower-case everything and drop tokens shorter than 3 characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    docList = []; classList = []; fullText = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50); testSet = []      # trainingSet is a list of the integers 0 to 49
    for i in range(10):                        # randomly pick 10 files as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex]) # the chosen document index joins the test set
        del(trainingSet[randIndex])            # and is removed from the training set
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount)/len(testSet)
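Continuing the toy numbers from the previous sketch, classifyNB just compares the two log posteriors; note how the class prior can outweigh the word evidence:

# classify a new toy document containing word A once (invented data from above)
from numpy import array
from bayes import trainNBO, classifyNB

trainMat = array([[2, 0], [0, 1], [1, 1]])
trainClasses = array([1, 0, 0])
p0V, p1V, pA = trainNBO(trainMat, trainClasses)
# p1 = log(0.75) + log(1/3) ~ -1.39; p0 = log(0.4) + log(2/3) ~ -1.32
print classifyNB(array([1, 0]), p0V, p1V, pA)   # prints 0: p0 wins despite word A favoring class 1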
Now let's try the whole pipeline. At the Python prompt, enter:
>>> reload(bayes)
<module 'bayes' from 'bayes.py'>
>>> bayes.spamTest()
the error rate is: 0.1
>>> bayes.spamTest()
the error rate is: 0.0
The spamTest() function prints the classification error rate on the 10 randomly selected emails. Because the test emails are chosen at random, the output can differ from run to run.
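Since each run draws a different random test set, a single error rate is noisy; a more stable estimate averages over many random splits. A minimal sketch, assuming spamTest() is modified to also return float(errorCount)/len(testSet) rather than only printing it:

# average the error rate over repeated random train/test splits
# (assumes a modified spamTest that returns its error rate)
from bayes import spamTest

numRuns = 10
total = 0.0
for k in range(numRuns):
    total += spamTest()
print 'average error rate over %d runs: %f' % (numRuns, total/numRuns)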
Step-by-step explanation:
>>> docList = []; classList = []; fullText = []
>>> for i in range(1, 26):
...     wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
...     docList.append(wordList)
...     fullText.extend(wordList)
...     classList.append(1)
...
>>> wordList
['experience', 'with', 'biggerpenis', 'today', 'grow', 'inches', 'more', 'the', 'safest', 'most', 'effective', 'methods', 'of_penisen1argement', 'save', 'your', 'time', 'and', 'money', 'bettererections', 'with', 'effective', 'ma1eenhancement', 'products', 'ma1eenhancement', 'supplement', 'trusted', 'millions', 'buy', 'today']
>>> docList
[['codeine', '15mg', 'for', ..., 'buy', 'today']]
>>> fullText
['codeine', '15mg', 'for', ..., 'buy', 'today']
>>> classList
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
>>> len(wordList)
29
>>> len(docList)
25
>>> len(fullText)
795
>>> len(classList)
25
>>> docList = []; classList = []; fullText = []
>>> for i in range(1, 26):
...     wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
...     docList.append(wordList)
...     fullText.extend(wordList)
...     classList.append(0)
...
>>> wordList
['that', 'cold', 'there', 'going', 'retirement', 'party', 'are', 'the', 'leaves', 'changing', 'color']
>>> docList
[['codeine', '15mg', ..., 'changing', 'color']]
>>> fullText
['codeine', '15mg', ..., 'changing', 'color']
>>> classList
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
>>> len(wordList)
11
>>> len(docList)
25
>>> len(fullText)
967
>>> len(classList)
25
>>> docList = []; classList = []; fullText = []
>>> for i in range(1, 26):
...     wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
...     docList.append(wordList)
...     fullText.extend(wordList)
...     classList.append(1)
...     wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
...     docList.append(wordList)
...     fullText.extend(wordList)
...     classList.append(0)
...
>>> wordList
['that', 'cold', 'there', 'going', 'retirement', 'party', 'are', 'the', 'leaves', 'changing', 'color']
>>> docList
[['codeine', '15mg', ..., 'changing', 'color']]
>>> fullText
['codeine', '15mg', ..., 'changing', 'color']
>>> classList
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
>>> len(wordList)
11
>>> len(docList)
50
>>> len(fullText)
1762
>>> len(classList)
50
>>> vocabList = bayes.createVocabList(docList)
>>> vocabList
['all', 'code', ..., 'others', 'once']
>>> len(vocabList)
692
>>> trainingSet = range(50); testSet = []
>>> for i in range(10):
...     randIndex = int(random.uniform(0, len(trainingSet)))
...     testSet.append(trainingSet[randIndex])
...     del(trainingSet[randIndex])
...
>>> trainMat = []; trainClasses = []
>>> for docIndex in trainingSet:
...     trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
...     trainClasses.append(classList[docIndex])
...
>>> shape(trainMat)
(40, 692)
# 40 rows by 692 columns: 40 training documents over 692 unique words
>>> p0V, p1V, pSpam = bayes.trainNBO(array(trainMat), array(trainClasses))
>>> len(p0V)
692
>>> len(p1V)
692

p0V and p1V each hold one entry per vocabulary word, the log conditionals log p(wj|ham) and log p(wj|spam), and pSpam is the prior probability that a document is spam.
The misclassifications here are consistently spam marked as ham. That is the better direction in which to err: letting the occasional spam message into the inbox is far less costly than filtering a legitimate email into the spam folder.
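If misfiling ham is the costlier mistake, one simple way to bias the classifier is to require a margin of extra evidence before flagging spam. This is a hypothetical variant, not part of the book's code; classifyNBWithMargin and its margin parameter are invented names:

from numpy import log

# hypothetical variant of classifyNB: only flag spam when the spam
# log-posterior beats the ham log-posterior by at least `margin`
def classifyNBWithMargin(vec2Classify, p0Vec, p1Vec, pClass1, margin=1.0):
    p1 = sum(vec2Classify*p1Vec) + log(pClass1)
    p0 = sum(vec2Classify*p0Vec) + log(1.0 - pClass1)
    if p1 > p0 + margin:    # larger margin -> fewer ham messages misfiled as spam
        return 1
    return 0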