机器学习算法学习---处理分类问题常用算法(四)

1、基于贝叶斯决策理论的分类方法(核心思想:选择具有最高概率的决策)

朴素贝叶斯

优点:在数据较少的情况下仍然有效,可以处理多类别问题。

缺点:对于输入数据的准备方式较为敏感。

适用类型:标称型数据。

2、条件概率

p(A|B)=p(A and B)/p(B)

贝叶斯准则:p(c|x)=p(x|c)p(c)/p(x)

使用条件概率来分类:

p(ci|x,y)=p(x,y|ci)p(ci)/p(x,y)

若P(c1|x,y)>P(c2|x,y),那么属于类别c1

 

若P(c2|x,y)>P(c1|x,y),那么属于类别c2
3、使用朴素贝叶斯进行文档分类

 

两个假设:(1)特征之间相互独立;(2)每个特征同等重要。

使用python实现文本分类:

 

Bayes

 

import operator
from numpy import *
from math import log
def loadDataset():
    """Return the sample tokenized posts and their class labels.

    Returns:
        docs: list of documents, each a list of word tokens.
        labels: per-document labels; 1 = abusive wording, 0 = normal speech.
    """
    docs = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return docs, labels
def createVocabList(Dataset):
    """Collect every distinct word across all documents.

    Args:
        Dataset: iterable of tokenized documents.

    Returns:
        List of unique words (order follows set iteration, so it is
        not guaranteed to be stable between runs).
    """
    vocab = set()
    for doc in Dataset:
        vocab.update(doc)
    return list(vocab)
def setOfWords2Vec(vocablist, inputSet):
    """Set-of-words model: mark presence/absence of each vocabulary word.

    Args:
        vocablist: the vocabulary (list of words).
        inputSet: a tokenized document.

    Returns:
        A list the same length as ``vocablist`` with 1 where the word
        occurs in the document and 0 elsewhere; unknown words are
        reported on stdout and ignored.
    """
    vec = [0] * len(vocablist)
    for word in inputSet:
        try:
            vec[vocablist.index(word)] = 1
        except ValueError:  # word not in the vocabulary
            print('the word %s is not in my vocabulary' % word)
    return vec
def bagOfWords2Vec(vocablist, inputSet):
    """Bag-of-words model: count occurrences of each vocabulary word.

    Args:
        vocablist: the vocabulary (list of words).
        inputSet: a tokenized document.

    Returns:
        A list the same length as ``vocablist`` holding per-word
        occurrence counts; unknown words are reported on stdout
        and ignored.
    """
    vec = [0] * len(vocablist)
    for word in inputSet:
        if word not in vocablist:
            print('the word %s is not in my vocabulary' % word)
            continue
        vec[vocablist.index(word)] += 1
    return vec
def getAllwordsVec(vocablist, Dataset):
    """Convert every document to its set-of-words vector.

    Args:
        vocablist: the vocabulary (list of words).
        Dataset: iterable of tokenized documents.

    Returns:
        A list of document vectors (the training matrix).
    """
    return [setOfWords2Vec(vocablist, doc) for doc in Dataset]
# p(ci|w) = p(w|ci) p(ci) / p(w); with the conditional-independence
# assumption: p(w0,w1,...,wn|ci) = p(w0|ci) p(w1|ci) ... p(wn|ci)
def trainbayes(trainMat, trainlabels):
    """Train a naive Bayes classifier (binary case).

    Args:
        trainMat: list of word vectors (one per document), each a numpy
            array or list of per-word counts / 0-1 flags.
        trainlabels: list of class labels, 0 or 1, one per document.

    Returns:
        p0Vect: log conditional probabilities log(p(w|c=0)), numpy array.
        p1Vect: log conditional probabilities log(p(w|c=1)), numpy array.
        pAbusive: prior probability p(c=1).
    """
    # Local import: the file-level `from math import log` shadows numpy's
    # elementwise log, which we need for the probability vectors.
    from numpy import log as np_log

    numTrainDocs = len(trainMat)
    numWords = len(trainMat[0])
    # Prior p(c=1); binary problem, so p(c=0) = 1 - pAbusive.
    pAbusive = sum(trainlabels) / float(numTrainDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so that
    # no conditional probability is ever exactly zero.
    p0num = ones(numWords)
    p1num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainlabels[i] == 1:
            p1num += trainMat[i]  # numpy elementwise accumulation
            p1Denom += sum(trainMat[i])
        else:
            p0num += trainMat[i]
            p0Denom += sum(trainMat[i])
    # Bug fix: take the log here. classify() SUMS these vectors and adds
    # log(prior), i.e. it expects log-probabilities; returning the raw
    # ratios both broke that contract and risked floating-point underflow
    # from multiplying many tiny probabilities.
    p1Vect = np_log(p1num / p1Denom)
    p0Vect = np_log(p0num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classify(testvec, p0vec, p1vec, pClass1):
    """Pick the more probable class for a document vector.

    Args:
        testvec: numpy word vector of the document to classify.
        p0vec: log conditional probabilities for class 0.
        p1vec: log conditional probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if class 1 scores strictly higher, else 0.
    """
    # log(p(w|c) p(c)) = sum of per-word log-likelihoods + log prior
    score1 = sum(testvec * p1vec) + log(pClass1)
    score0 = sum(testvec * p0vec) + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0
def testbayes():
    """Smoke test: train on the sample posts, classify two new entries.

    Prints the predicted class (0 or 1) for each test entry.
    """
    posts, labels = loadDataset()
    vocab = createVocabList(posts)
    train_matrix = getAllwordsVec(vocab, posts)
    p0v, p1v, p_abusive = trainbayes(train_matrix, labels)
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        vec = array(setOfWords2Vec(vocab, entry))
        print(classify(vec, p0v, p1v, p_abusive))


if __name__ == '__main__':
    testbayes()

 

posted @ 2019-05-05 16:25  2048的渣渣  阅读(570)  评论(0编辑  收藏  举报