【机器学习】朴素贝叶斯-01
心得体会
1.计算每一个特征值在不同结果中的占比,例如特征值c在结果A的所有特征中的“占比”为rA,在结果B的所有特征中的“占比”为rB,若rB>rA则说明结果B的可能性更大
2.得到一条数据,将该数据转换成特征向量,计算该特征向量在不同结果下的概率,选择其中概率最大的结果,例如PA>PB,则该数据的分类结果为A
3.通过对特征值的加权(通过占比得到的向量默认每个特征值的权重相等),使结果更加准确
4.防止概率结果下溢的方法:使用log(a*b)=log(a)+log(b)的方式使乘法变成加法,代替存储概率
5.防止特征值空缺导致结果为0的方法:默认每个特征值初始化为1/(该特征值的域)
from numpy import *


# 4-5 Text classification with naive Bayes
# 4-5-1 Prepare data: build word vectors from text
def loadDataSet():
    """Return a toy corpus for testing.

    Returns:
        postingList: list of tokenized posts (each a list of words).
        classVec: parallel list of labels, 1 = abusive, 0 = not abusive.
    """
    postingList=[
        ['my','dog','has','flea','probblems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec=[0,1,0,1,0,1]
    return postingList,classVec


def createVocabList(dataSet):
    """Build the vocabulary: a list of every unique word in the corpus."""
    vocabSet=set()
    for document in dataSet:
        vocabSet=vocabSet | set(document)  # set union accumulates unique words
    return list(vocabSet)


def setOfWords2Vec(vocabList,inputSet):
    """Set-of-words model: 0/1 vector, each word counted at most once.

    Args:
        vocabList: vocabulary list produced by createVocabList.
        inputSet: iterable of words from one document.

    Returns:
        List of 0/1 flags, same length/order as vocabList.
    """
    # One-time dict lookup table: O(1) per word instead of O(V) list.index
    wordIndex={word:i for i,word in enumerate(vocabList)}
    returnVec=[0]*len(vocabList)  # start from an all-zero vector
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]]=1
        else:
            print("the word %s is not in my vocabulary"%word)
    return returnVec


def setOfWords2VecMN(vocabList,inputSet):
    """Bag-of-words model: count vector, each word counted every time it occurs.

    Same interface as setOfWords2Vec but entries are occurrence counts.
    """
    wordIndex={word:i for i,word in enumerate(vocabList)}  # O(1) lookups
    returnVec=[0]*len(vocabList)
    for word in inputSet:
        if word in wordIndex:
            returnVec[wordIndex[word]]+=1
        else:
            print("the word %s is not in my vocabulary"%word)
    return returnVec


# Naive Bayes classifier training function
def trainNBO(trainMatrix,trainCategory):
    """Train a two-class naive Bayes model.

    Args:
        trainMatrix: matrix of word vectors (one row per document).
        trainCategory: array/list of labels, 1 = abusive, 0 = not.

    Returns:
        p0Vect: log P(word | class 0) per vocabulary word.
        p1Vect: log P(word | class 1) per vocabulary word.
        pAbusive: prior P(class 1).
    """
    numTrainDocs=len(trainMatrix)   # number of training documents
    numWords=len(trainMatrix[0])    # vocabulary size
    pAbusive=sum(trainCategory)/float(numTrainDocs)  # fraction of abusive docs
    # Laplace smoothing: initialize counts to 1 and denominators to 2 so
    # an unseen word never yields a zero probability.
    p0Num=ones(numWords)
    p1Num=ones(numWords)
    p0Denom=2.0
    p1Denom=2.0
    for i in range(numTrainDocs):
        if trainCategory[i]==1:
            p1Num+=trainMatrix[i]           # per-word counts in abusive docs
            p1Denom+=sum(trainMatrix[i])    # total words in abusive docs
        else:
            p0Num+=trainMatrix[i]           # per-word counts in normal docs
            p0Denom+=sum(trainMatrix[i])    # total words in normal docs
    # Store log probabilities to avoid underflow: log(a*b)=log(a)+log(b)
    p1Vect=log(p1Num/p1Denom)
    p0Vect=log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive
myVocabList=createVocabList(listOPosts) # trainMat=[] # for postinDoc in listOPosts: # trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) # p0V,p1V,pAb=trainNBO(trainMat,listClass) #朴素贝叶斯分类函数 def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1): p1=sum(vec2Classify*p1Vec)+log(pClass1) p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1) if p1>p0: return 1 else: return 0 def testingNB(): listOPosts,listClasses=loadDataSet() myVocabList=createVocabList(listOPosts) trainMat=[] for postinDoc in listOPosts: trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) p0V,p1V,pAb=trainNBO(array(trainMat),array(listClasses)) testEntry=['love','my','dalmation'] thisDoc=array(setOfWords2Vec(myVocabList,testEntry)) print(testEntry,"classified as :",classifyNB(thisDoc,p0V,p1V,pAb)) testEntry = ['stupid','garbage'] thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) print(testEntry, "classified as :", classifyNB(thisDoc, p0V, p1V, pAb))