# 机器学习实战源码 —— 使用朴素贝叶斯进行文档分类

from __future__ import print_function

import numpy as np

 3 def loadDataSet():
 4     postingList = [["my","dog","has","flea",
 5                     "problems","help","please"],
 6                    ["maybe","not","take","him",
 7                     "to","dog","park","stupid"],
 8                    ["my","dalmation","is","so","cute",
 9                     "I","love","him"],
10                    ["stop","posting","stupid","worthless","garbage",],
11                    ["my","licks","ate","my","steak","how",
12                     "to","stop","him"],
13                    ["qiut","buying","worthless","dog","food","stupid"]]
14     classVec = [0,1,0,1,0,1]
15     return postingList,classVec
16 
def createVocabList(dataSet):
    """Return the unique words found across all documents.

    Words are listed in first-seen order. The previous ``list(set(...))``
    ordering could differ between interpreter runs, which made the feature
    columns built from the vocabulary non-reproducible.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        list: every distinct token exactly once; ``[]`` for an empty dataSet.
    """
    vocabList = []
    seen = set()
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabList.append(word)
    return vocabList

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of hashable vocabulary words; position i of the
            result corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        list: ``len(vocabList)`` ints, 1 where the vocabulary word occurs in
        ``inputSet`` and 0 elsewhere. Words missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    # Build the word -> index map once instead of scanning the list twice
    # per word; setdefault keeps the first index, matching list.index().
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[idx] = 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Fit the naive Bayes parameters from word vectors and labels.

    Per class, each word's log-likelihood is
    ``log((1 + count of the word in the class) / (2 + total count in the class))``.
    Starting the numerators at 1 and the denominators at 2 keeps a word
    unseen in a class from producing ``log(0)``.

    Args:
        trainMatrix: 2-D array-like, one row of word counts/flags per
            document (all rows the same length).
        trainCategory: 1-D array-like of labels, one per row. Rows labelled
            1 feed the class-1 estimates; every other row feeds class 0.

    Returns:
        tuple: ``(p0Vect, p1Vect, pAbusive)`` - the two per-word
        log-likelihood arrays and ``sum(trainCategory) / number of documents``
        (the class-1 prior when labels are 0/1).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    docs = np.asarray(trainMatrix)
    isClass1 = np.asarray(trainCategory) == 1
    class1Docs = docs[isClass1]
    class0Docs = docs[~isClass1]

    # Column sums give per-word counts; the grand sum gives the class total.
    p1Num = np.ones(numWords) + class1Docs.sum(axis=0)
    p1Denom = 2.0 + class1Docs.sum()
    p0Num = np.ones(numWords) + class0Docs.sum(axis=0)
    p0Denom = 2.0 + class0Docs.sum()

    # Work in log space so products of many small probabilities do not
    # underflow when the classifier adds them up.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log posterior score.

    Args:
        vec2Classify: numeric word vector for the document to classify.
        p0Vec: per-word log-likelihoods for class 0.
        p1Vec: per-word log-likelihoods for class 1.
        pClass1: prior probability of class 1; class 0 uses ``1 - pClass1``.

    Returns:
        int: 1 when the class-1 score is strictly greater, otherwise 0
        (ties go to class 0).
    """
    # Sum of log-likelihoods weighted by the word vector, plus the log prior.
    logScore1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    logScore0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if logScore1 > logScore0 else 0

def testingNB():
    """Train on the built-in corpus and print predictions for two samples.

    Loads the toy data set, builds the vocabulary and the set-of-words
    training matrix, fits the model with ``trainNB0`` and prints the class
    chosen by ``classifyNB`` for each sample entry. Returns None.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    # Same two probes as before, classified and reported one after another.
    for testEntry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, "classified as :", classifyNB(thisDoc, p0V, p1V, pAb))

def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: list of hashable vocabulary words; position i of the
            result corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        list: ``len(vocabList)`` ints giving how many times each vocabulary
        word occurs in ``inputSet``. Words missing from the vocabulary are
        silently ignored.
    """
    # Build the word -> index map once instead of scanning the list twice
    # per word; setdefault keeps the first index, matching list.index().
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

if __name__ == "__main__":
    # testingNB() prints its own results and returns None, so printing its
    # return value would only add a stray "None" line.
    testingNB()

 

# posted @ 2017-10-26 16:47  __迷途的羔羊  阅读(268)  评论(0)  编辑  收藏  举报