

import numpy as np

def loadDataSet():
    """Return the toy message-board corpus used throughout this example.

    :return: a ``(postingList, classVec)`` tuple. ``postingList`` is a list
        of documents that are already split into word tokens, and
        ``classVec`` is the list of hand-assigned class labels, one per
        document, in the same order.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    # Labels were assigned by hand: 1 is abusive, 0 is not. The documents
    # labelled 1 are the ones containing 'stupid', 'worthless' or 'garbage'.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Build the vocabulary: every distinct token that occurs in ``dataSet``.

    :param dataSet: iterable of documents, each an iterable of tokens.
    :return: list holding each distinct token exactly once. The order is
        whatever order the underlying set yields, so it is not sorted.
    """
    vocabSet = set()
    for document in dataSet:
        # In-place union avoids allocating a new set for every document.
        vocabSet.update(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    :param vocabList: the vocabulary, a list of tokens.
    :param inputSet: the tokens of one document.
    :return: list with the same length as ``vocabList``; entry ``i`` is 1
        when ``vocabList[i]`` occurs in the document and 0 otherwise.
        Tokens missing from the vocabulary are reported on stdout and
        otherwise ignored.
    """
    # Map each token to its first position once, instead of scanning the
    # vocabulary with list.index() for every word in the document.
    # setdefault keeps the first occurrence, matching list.index().
    positionOf = {}
    for position, token in enumerate(vocabList):
        positionOf.setdefault(token, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = positionOf.get(word)
        if position is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors.

    Per-class word counts start at 1 and per-class totals start at 2.0, so
    a word never seen in a class still gets a non-zero probability. The
    conditional probabilities are returned as natural logarithms, which is
    the form ``classifyNB`` adds together.

    :param trainMatrix: document matrix (the training set), one row per
        document and one column per vocabulary word.
    :param trainCategory: class label of each document. Documents labelled
        1 form class 1; every other document is counted as class 0.
    :return: ``(p0Vect, p1Vect, pAbusive)`` where ``p0Vect`` / ``p1Vect``
        are arrays of log P(word | class 0) / log P(word | class 1) and
        ``pAbusive`` is ``sum(trainCategory) / number of documents``.
    :raises ValueError: if ``trainMatrix`` contains no documents.
    """
    numTrainDocs = len(trainMatrix)
    if numTrainDocs == 0:
        raise ValueError("trainMatrix must contain at least one document")

    docs = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    pAbusive = labels.sum() / float(numTrainDocs)

    # Each document contributes to exactly one class.
    isClass1 = labels == 1
    class1Docs = docs[isClass1]
    class0Docs = docs[~isClass1]

    # Smoothed counts: numerators start at 1, denominators at 2.0.
    p1Num = 1.0 + class1Docs.sum(axis=0)
    p0Num = 1.0 + class0Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Denom = 2.0 + class0Docs.sum()

    # Work in log space so that many small probabilities can be summed
    # later instead of multiplied.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

if __name__ == '__main__':
    # Load the toy corpus and derive its vocabulary.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)

    # Encode every post as a presence/absence vector over the vocabulary,
    # then fit the per-class word probabilities and the class prior.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0v, p1v, pAb = trainNB0(trainMat, listClasses)

['love', 'take', 'cute', 'so', 'flea', 'posting', 'stop', 'help', 'mr', 'stupid', 'ate', 'garbage', 'has', 'I', 'problems', 'licks', 'worthless', 'is', 'how', 'not', 'maybe', 'dalmation', 'food', 'buying', 'please', 'him', 'park', 'quit', 'steak', 'my', 'dog', 'to']
[[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]]
[0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.         0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.         0.         0.04166667 0.         0.
 0.04166667 0.08333333 0.         0.         0.04166667 0.125
 0.04166667 0.04166667]
[0.         0.05263158 0.         0.         0.         0.05263158
 0.05263158 0.         0.         0.15789474 0.         0.05263158
 0.         0.         0.         0.         0.10526316 0.
 0.         0.05263158 0.05263158 0.         0.05263158 0.05263158
 0.         0.05263158 0.05263158 0.05263158 0.         0.
 0.10526316 0.05263158]

以词汇表中索引为2的 cute 为例:它在类别0中出现1次(类别0共有24个词,1/24≈0.04166667),在类别1中出现0次(类别1共有19个词,0/19=0),因此对应的条件概率分别是0.04166667与0。




# Smoothed, log-space form of the estimates computed in trainNB0. The
# statements use that function's local numWords, so as written here they
# are a fragment meant to live inside trainNB0, not runnable module code.
# Counts start at 1 and totals at 2.0 so no probability is exactly zero.
p0Num = np.ones(numWords)
p1Num = np.ones(numWords)
p0Denom = 2.0
p1Denom = 2.0
# Logs let the classifier add terms instead of multiplying tiny numbers.
p1Vect = np.log(p1Num / p1Denom)
p0Vect = np.log(p0Num / p0Denom)


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one document vector.

    Each class score is the sum of the element-wise product of the
    document vector with that class's vector, plus the log of the class
    prior. The vectors are therefore expected to hold log probabilities.

    :param vec2Classify: word vector of the document to classify.
    :param p0Vec: per-word log probabilities for class 0.
    :param p1Vec: per-word log probabilities for class 1.
    :param pClass1: prior probability of class 1; class 0 uses
        ``1.0 - pClass1``.
    :return: 1 when the class-1 score is strictly greater than the
        class-0 score, otherwise 0 (ties go to class 0).
    """
    wordVec = np.asarray(vec2Classify)
    p1 = np.sum(wordVec * p1Vec) + np.log(pClass1)  # element-wise mult
    p0 = np.sum(wordVec * p0Vec) + np.log(1.0 - pClass1)
    # Always return an int label; never fall through and return None.
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Loads the corpus, builds the vocabulary, encodes every post as a word
    vector, fits the model with ``trainNB0`` and then prints the label
    that ``classifyNB`` assigns to each of two hard-coded test documents.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    encodedPosts = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(encodedPosts), np.array(labels))

    # Classify and report each sample document in turn.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(vocabulary, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
posted @ 2020-12-31 11:26  fly_bk  阅读(68)  评论(0)  编辑  收藏  举报