[Machine Learning in Action] Classifying with Probability Theory: Naive Bayes

Classifying with Probability Theory: Naive Bayes

(Naive Bayesian classification)

Pros: still effective with small amounts of data; handles multi-class problems.
Cons: sensitive to how the input data is prepared.
Works with: nominal values.

p(ci|x,y) = p(x,y|ci)p(ci)/p(x,y)

Bayesian decision rule:
If p(c1|x,y) > p(c2|x,y), the instance belongs to class c1;
if p(c1|x,y) < p(c2|x,y), the instance belongs to class c2.

Why "naive"?
From statistics we know that if each feature requires N samples, then a vocabulary of 1000 features would require N^1000 samples. Naive Bayes makes two assumptions:

  1. Features are conditionally independent of one another, so the required number of samples drops from N^1000 to 1000×N (see the sketch below).
  2. Every feature is equally important.
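To make these assumptions concrete, here is a minimal sketch of the decision rule under conditional independence; all probabilities below are made up purely for illustration.

# Hypothetical sketch of the naive Bayes decision rule (all numbers are made up)
p_w_given_c1 = [0.05, 0.10, 0.15]  # p(w0|c1), p(w1|c1), p(w2|c1)
p_w_given_c0 = [0.04, 0.04, 0.01]  # p(w0|c0), p(w1|c0), p(w2|c0)
p_c1, p_c0 = 0.5, 0.5              # class priors p(c1), p(c0)
score_c1, score_c0 = p_c1, p_c0
for p in p_w_given_c1:
    score_c1 *= p                  # independence: p(w|c1) = p(w0|c1)*p(w1|c1)*p(w2|c1)
for p in p_w_given_c0:
    score_c0 *= p
# the denominator p(w) is shared by both classes, so it can be ignored when comparing
print('c1' if score_c1 > score_c0 else 'c0')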

I. Classifying posts on a message board

Token: a unit of text, such as a word, a URL, or an IP address.

1. Prepare the data: building word vectors from text

# Functions to convert a word list into a vector
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthless','garbage'],
                  ['mr','licks','ate','my','steak','how','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1] # 1 = abusive, 0 = normal
    return postingList, classVec
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print('the word: %s is not in my Vocabulary!' % word)
    return returnVec
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
print(myVocabList)
['dalmation', 'is', 'I', 'quit', 'stop', 'park', 'dog', 'my', 'mr', 'take', 'worthless', 'garbage', 'maybe', 'cute', 'so', 'please', 'to', 'food', 'licks', 'stupid', 'him', 'help', 'flea', 'buying', 'steak', 'has', 'not', 'problems', 'posting', 'ate', 'how', 'love']
print(setOfWords2Vec(myVocabList, listOPosts[0]))
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]

2. Train the algorithm: computing probabilities from word vectors

# Compute probabilities from word vectors
import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i] # per-word occurrence counts in class-1 documents
            p1Denom += sum(trainMatrix[i]) # total word count in class-1 documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom # change to log() later; 1*numWords vector of p(wi|c1) per token
    p0Vect = p0Num/p0Denom # change to log() later; 1*numWords vector of p(wi|c0) per token
    return p0Vect, p1Vect, pAbusive
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
print(trainMat[:3])
[[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
pAb
0.5
p0V
array([ 0.04166667,  0.04166667,  0.04166667,  0.        ,  0.04166667,
        0.        ,  0.04166667,  0.125     ,  0.04166667,  0.        ,
        0.        ,  0.        ,  0.        ,  0.04166667,  0.04166667,
        0.04166667,  0.04166667,  0.        ,  0.04166667,  0.        ,
        0.08333333,  0.04166667,  0.04166667,  0.        ,  0.04166667,
        0.04166667,  0.        ,  0.04166667,  0.        ,  0.04166667,
        0.04166667,  0.04166667])
p1V
array([ 0.        ,  0.        ,  0.        ,  0.05263158,  0.05263158,
        0.05263158,  0.10526316,  0.        ,  0.        ,  0.05263158,
        0.10526316,  0.05263158,  0.05263158,  0.        ,  0.        ,
        0.        ,  0.05263158,  0.05263158,  0.        ,  0.15789474,
        0.05263158,  0.        ,  0.        ,  0.05263158,  0.        ,
        0.        ,  0.05263158,  0.        ,  0.05263158,  0.        ,
        0.        ,  0.        ])
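A quick sanity check on the vectors above (a sketch that assumes the vocabulary ordering printed earlier; set() ordering can differ between runs): the largest entry of p1V should correspond to 'stupid', the word that appears most often in the abusive posts, and the largest entry of p0V to 'my'.

# Sanity check (sketch): map the largest conditional probabilities back to words
print(myVocabList[np.argmax(p1V)])  # expected: 'stupid'
print(myVocabList[np.argmax(p0V)])  # expected: 'my'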

3. Test the algorithm: modifying the classifier for real-world conditions

  1. When computing p(w0|1)p(w1|1)p(w2|1)..., a single probability of 0 makes the whole product 0. To lessen this effect, initialize every word count to 1 and each denominator to 2 (Laplace smoothing).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
  2. Underflow: multiplying many very small numbers rounds to 0 in floating point. The fix is to take the natural logarithm of the product.
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)
# Revised probability-calculation function
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i] # per-word occurrence counts in class-1 documents
            p1Denom += sum(trainMatrix[i]) # total word count in class-1 documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom) # 1*numWords vector of log p(wi|c1) per token
    p0Vect = np.log(p0Num/p0Denom) # 1*numWords vector of log p(wi|c0) per token
    return p0Vect, p1Vect, pAbusive
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1) # ln(a*b) = ln(a) + ln(b)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [] # matrix of word vectors, size: len(listOPosts) * len(myVocabList)
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
testingNB()
['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1

4. Prepare the data: the bag-of-words document model

Set-of-words model
Whether or not each word appears is used as a feature: function setOfWords2Vec().
Bag-of-words model
Each additional occurrence of a word increments the corresponding entry of the word vector: function bagOfWords2VecMN().

# Naive Bayes bag-of-words model
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
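A short sketch contrasting the two models on a made-up document with a repeated token, reusing myVocabList from above:

# Sketch: the same document under the set-of-words and bag-of-words models
sampleDoc = ['stupid', 'stupid', 'garbage']
setVec = setOfWords2Vec(myVocabList, sampleDoc)
bagVec = bagOfWords2VecMN(myVocabList, sampleDoc)
print(setVec[myVocabList.index('stupid')])  # 1: presence only
print(bagVec[myVocabList.index('stupid')])  # 2: occurrence count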

II. Filtering spam email with naive Bayes

1. Prepare the data: tokenizing text

mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
print(mySent.split())
['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']
import re
regEx = re.compile('\\W*') # matches non-word characters (anything other than letters, digits, and underscore)
listOfTokens = regEx.split(mySent)
listOfTokens = [tok.lower() for tok in listOfTokens if len(tok) > 0]
print(listOfTokens)
['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']


C:\Users\Day\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: split() requires a non-empty pattern match.
  This is separate from the ipykernel package so we can avoid doing imports until
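The FutureWarning appears because \W* can match the empty string, and re.split() treats such patterns differently in newer Python versions. A sketch of the usual workaround is to split on one or more non-word characters, which yields the same tokens without the warning:

# Sketch: splitting on one-or-more non-word characters avoids zero-length matches
regExPlus = re.compile(r'\W+')
tokens = [tok.lower() for tok in regExPlus.split(mySent) if len(tok) > 0]
print(tokens)  # same token list as above, without the FutureWarning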
emailText = open('data/email/ham/6.txt').read()
listOfTokens = regEx.split(emailText)
print(listOfTokens)
['Hello', 'Since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'Google', 'Groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'or', 'files', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'February', '2011', 'We', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'Google', 'Groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'Instead', 'of', 'these', 'features', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'as', 'Google', 'Docs', 'and', 'Google', 'Sites', 'For', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'on', 'Google', 'Sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '174623', 'with', 'the', 'members', 'of', 'your', 'group', 'You', 'can', 'also', 'store', 'your', 'files', 'on', 'the', 'site', 'by', 'attaching', 'files', 'to', 'pages', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '90563', 'on', 'the', 'site', 'If', 'you抮e', 'just', 'looking', 'for', 'a', 'place', 'to', 'upload', 'your', 'files', 'so', 'that', 'your', 'group', 'members', 'can', 'download', 'them', 'we', 'suggest', 'you', 'try', 'Google', 'Docs', 'You', 'can', 'upload', 'files', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '50092', 'and', 'share', 'access', 'with', 'either', 'a', 'group', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '66343', 'or', 'an', 'individual', 'http', 'docs', 'google', 'com', 'support', 'bin', 'answer', 'py', 'hl', 'en', 'answer', '86152', 'assigning', 'either', 'edit', 'or', 'download', 'only', 'access', 'to', 'the', 'files', 'you', 'have', 'received', 'this', 'mandatory', 'email', 'service', 'announcement', 'to', 'update', 'you', 'about', 'important', 'changes', 'to', 'Google', 'Groups', '']


C:\Users\Day\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: split() requires a non-empty pattern match.

2. Test the algorithm: cross-validation with naive Bayes

# Text parsing and the full spam-test function
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26): # i = 1,2,...,25; 50 emails in total (25 spam + 25 ham)
        wordList = textParse(open('data/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('data/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList) # vocabulary
    trainingSet = list(range(50)) # [0,1,...,49]; a list so that del(trainingSet[randIndex]) works in Python 3
    testSet = []
    # hold-out cross-validation
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses)) # probabilities p0V, p1V, pSpam needed for classification
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet)) # report once, after all test docs are classified
spamTest()
the error rate is:  0.0


C:\Users\Day\Anaconda3\lib\re.py:212: FutureWarning: split() requires a non-empty pattern match.
  return _compile(pattern, flags).split(string, maxsplit)
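Because the 10 test emails are chosen at random, one run of spamTest() gives a noisy estimate. A sketch of a steadier estimate, assuming spamTest() is modified to return float(errorCount)/len(testSet) instead of only printing it:

# Sketch: average the error rate over repeated random train/test splits
# (assumes spamTest() has been changed to return its error rate)
numRuns = 10
errorSum = 0.0
for _ in range(numRuns):
    errorSum += spamTest()
print('average error rate over %d runs: %f' % (numRuns, errorSum/numRuns))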

III. Using the naive Bayes classifier to reveal regional attitudes from personal ads

1. Collect the data: importing an RSS feed

Installing feedparser

  1. Download it from code.google.com/p/feedparser/
  2. cd into the unpacked folder
  3. python setup.py install
import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index/rss')

An unresolved issue with this RSS feed: at the moment ny['entries'] only yields HTML links, and the body text only appears after opening them; parsing those pages to extract the text may be needed and is left for later.

# RSS-feed classifier and high-frequency-word removal function
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

Besides removing high-frequency words, words can also be removed using a stop word list.
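A sketch of that idea, assuming a hypothetical file stopwords.txt with one lowercase word per line:

# Sketch: filter the vocabulary against a stop word list (stopwords.txt is hypothetical)
def removeStopWords(vocabList, stopWordFile='stopwords.txt'):
    with open(stopWordFile) as f:
        stopWords = set(line.strip() for line in f)
    return [tok for tok in vocabList if tok not in stopWords]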

def localWords(feed1, feed0):
    import feedparser
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen)); testSet = [] # list() so that del(trainingSet[randIndex]) works in Python 3
    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0.0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
ny = feedparser.parse('http://newyork.craigslist.org/stp/index/rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index/rss')
#vocabList, pSF, pNY = localWords(ny, sf)

The RSS feed text issue noted above is still unresolved, so this is not executed for now.

The error rate obtained here is much higher than in the spam-filtering example, but the focus here is on the word probabilities rather than the actual classification. You can change the number of words removed by calcMostFreq() and watch how the error rate responds (a sketch of this follows).
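A sketch of that experiment: make the number of removed high-frequency words a parameter, then rerun localWords() for several values of topN and compare the printed error rates (localWords() would need to pass topN through to this version):

# Sketch: parameterize how many high-frequency words are removed
def calcMostFreq(vocabList, fullText, topN=30):
    import operator
    freqDict = {token: fullText.count(token) for token in vocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]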

2. Analyze the data: displaying region-specific words

# Function to display the most characteristic words
def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print('SF**SF**SF**SF**SF**SF')
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print('NY**NY**NY**NY**NY**NY')
    for item in sortedNY:
        print(item[0])
#getTopWords(ny, sf)

Summary:

  1. For classification, using probabilities is often more effective than hard rules. Bayesian probability and the Bayesian decision rule provide a way to estimate unknown probabilities from known values.
  2. The conditional-independence assumption between features reduces the amount of data required.
  3. Underflow can be avoided by taking the logarithm of the probabilities.