[Machine Learning] Naive Bayes - 01

Takeaways

  1. For each feature value, compute its proportion within each class: e.g. if feature value c accounts for proportion rA of all features in class A and proportion rB in class B, then rB > rA suggests class B is the more likely result.

  2. Given a new data point, convert it into a feature vector, compute the probability of each possible class from all of its feature values, and pick the class with the largest probability: e.g. if PA > PB, the predicted result is class A.

  3. Weighting the feature values can make the result more accurate (the proportion-based vectors implicitly give every feature value equal weight).

  4. To prevent the probability product from underflowing: use log(a*b) = log(a) + log(b) to turn multiplication into addition, storing log-probabilities instead of raw probabilities (a short sketch follows this list).

  5. To prevent a missing feature value from forcing the whole product to 0: initialize each feature value's probability to 1/(the size of that feature's domain) by default (Laplace smoothing; in the code below, counts start at 1 and denominators at 2).
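A minimal sketch of the underflow problem from point 4 (the probability values are made up for illustration): multiplying many small conditional probabilities underflows to 0.0 in double precision, while summing their logarithms stays finite and still lets us compare classes.

import math

probs=[1e-5]*100        # 100 hypothetical conditional probabilities
product=1.0
for p in probs:
    product*=p
print(product)          # 0.0 -- the true value 1e-500 underflows a double

logSum=0.0
for p in probs:
    logSum+=math.log(p)
print(logSum)           # about -1151.3, finite and still comparable across classes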

from numpy import *
#4-5 Classifying text with naive Bayes
#4-5-1 Prepare the data: building word vectors from text
#Word-list-to-vector conversion functions
def loadDataSet():
    postingList=[
        ['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec=[0,1,0,1,0,1]  #1 = abusive, 0 = normal
    return postingList,classVec

#Build the vocabulary list (the header of the word vectors)
def createVocabList(dataSet):
    vocabSet=set([])
    for document in dataSet:
        vocabSet=vocabSet | set(document) #set union
    return list(vocabSet)

#Convert a sentence into a word vector
def setOfWords2Vec(vocabList,inputSet):#set-of-words model: record each word at most once
    returnVec=[0]*len(vocabList)    #create a zero vector
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)]=1
        else: print("the word %s is not in my vocabulary"%word)
    return returnVec

def setOfWords2VecMN(vocabList,inputSet):#bag-of-words model: count every occurrence
    returnVec=[0]*len(vocabList)    #create a zero vector
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)]+=1
        else: print("the word %s is not in my vocabulary"%word)
    return returnVec

# listOPosts,listClass=loadDataSet()
# myVocabList=createVocabList(listOPosts)
# print(myVocabList)
# print(setOfWords2Vec(myVocabList,listOPosts[0]))
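A quick illustration of the difference between the two vectorizers (the input sentence here is hypothetical): setOfWords2Vec only records presence (0/1), while setOfWords2VecMN counts occurrences.

# listOPosts,listClass=loadDataSet()
# myVocabList=createVocabList(listOPosts)
# doc=['dog','dog','stupid']                  # hypothetical input
# print(setOfWords2Vec(myVocabList,doc))      # the entry for 'dog' is 1
# print(setOfWords2VecMN(myVocabList,doc))    # the entry for 'dog' is 2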

# Naive Bayes classifier training function
def trainNBO(trainMatrix,trainCategory):#matrix of word vectors, list of class labels
    numTrainDocs=len(trainMatrix)#number of training documents
    numWords=len(trainMatrix[0])#vocabulary size (length of a word vector)
    pAbusive=sum(trainCategory)/float(numTrainDocs)#fraction of abusive posts in the training set, i.e. p(c=1)
    p0Num=ones(numWords)#initialize counts to 1 so no conditional probability can be 0
    p1Num=ones(numWords)
    p0Denom=2.0#initialize denominators to 2, giving each word a default probability of 1/2
    p1Denom=2.0
    for i in range(numTrainDocs):
        if trainCategory[i]==1:#if the post is abusive
            p1Num+=trainMatrix[i]#word counts in abusive posts
            p1Denom+=sum(trainMatrix[i])#total word count in abusive posts
        else:
            p0Num+=trainMatrix[i]#word counts in non-abusive posts
            p0Denom+=sum(trainMatrix[i])#total word count in non-abusive posts
    p1Vect=log(p1Num/p1Denom)    #log-probability of each word given the abusive class
    p0Vect=log(p0Num/p0Denom)    #log-probability of each word given the non-abusive class (log prevents underflow: log(a*b)=log(a)+log(b))
    return p0Vect,p1Vect,pAbusive

# listOPosts,listClass=loadDataSet()
# myVocabList=createVocabList(listOPosts)
# trainMat=[]
# for postinDoc in listOPosts:
#     trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
# p0V,p1V,pAb=trainNBO(trainMat,listClass)
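A sanity check on the trained model, continuing the commented-out lines above: pAb should come out to 0.5, since three of the six training posts are labeled abusive, and the highest-probability word under the abusive class should be 'stupid', which appears in all three abusive posts.

# print(pAb)                          # 0.5
# print(myVocabList[argmax(p1V)])     # 'stupid' (argmax comes from numpy)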

#Naive Bayes classification function
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1=sum(vec2Classify*p1Vec)+log(pClass1)     #log p(w|c=1) + log p(c=1)
    p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1) #log p(w|c=0) + log p(c=0)
    if p1>p0:
        return 1
    else:
        return 0
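Why comparing these two sums works: by Bayes' rule p(c|w)=p(w|c)p(c)/p(w), and p(w) is the same for both classes, so it is enough to compare p(w|c)p(c). Taking logs and applying the naive independence assumption gives log p(w|c)+log p(c)=sum_i(w_i*log p(word_i|c))+log p(c), which is exactly what p1 and p0 compute from the log-probability vectors returned by trainNBO.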

def testingNB():
    listOPosts,listClasses=loadDataSet()
    myVocabList=createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb=trainNBO(array(trainMat),array(listClasses))
    testEntry=['love','my','dalmation']
    thisDoc=array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,"classified as :",classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, "classified as :", classifyNB(thisDoc, p0V, p1V, pAb))
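Running testingNB() should print 0 (normal) for ['love','my','dalmation'] and 1 (abusive) for ['stupid','garbage'], since 'stupid' and 'garbage' occur only in the abusive training posts.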
