41. Learning Python: Getting Started
1. The naive Bayes classification algorithm (building word vectors from text)
The word vectors are built as follows:

```python
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]    # 1 = abusive, 0 = normal speech
    return postingList, classVec

def createVocabList(dataSet):
    # Drop the duplicates from every document and merge everything into one list
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
```
The commands entered at the Python prompt (in cmd) are:

```python
>>> import bayes
>>> listOPosts, listClasses = bayes.loadDataSet()
>>> myVocabList = bayes.createVocabList(listOPosts)
>>> myVocabList
```
Running this prints the vocabulary list. Inspect it and you will find that no word appears twice. The list is not sorted yet; if needed, it can be sorted later.
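The sort mentioned above is a single call. A minimal sketch, using a made-up four-word vocabulary (a real one would come from createVocabList(), whose set-based order varies between runs):

```python
# Hypothetical tiny vocabulary standing in for createVocabList()'s output
vocab = ['my', 'dog', 'has', 'flea']
print(sorted(vocab))  # ['dog', 'flea', 'has', 'my']
```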
Now look at the function setOfWords2Vec() in action. It takes the vocabulary list and the words to check as input, and builds a feature for each word: given a document (a posting from the Dalmatian-lovers message board), the document is converted into a word vector. Next, verify that the function works. Which word sits at index 2 of myVocabList? It should be the word help. That word appears in the first document; now check whether it appears in the fourth document.
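That check can be scripted. The sketch below inlines createVocabList and setOfWords2Vec from above with just the first and fourth documents, and looks 'help' up by name rather than by a fixed index, since the set-based vocabulary order varies between runs:

```python
def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

docs = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],  # first document
        ['stop', 'posting', 'stupid', 'worthless', 'garbage']]       # fourth document
vocab = createVocabList(docs)
idx = vocab.index('help')
print(setOfWords2Vec(vocab, docs[0])[idx])  # 1: 'help' appears in the first document
print(setOfWords2Vec(vocab, docs[1])[idx])  # 0: 'help' is absent from the fourth document
```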
November 24, 2016
This morning I finally got the Python classification of the data working.
First, look at how the calling code is written:

```python
from numpy import *
import bayes

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
```
Then the module being called (bayes.py):
```python
import numpy

def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]    # 1 = abusive, 0 = normal speech
    return postingList, classVec

def createVocabList(dataSet):
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # For each input word, record whether it appears in the merged vocabulary list
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = numpy.zeros(numWords)      # per-word counts for each class
    p1Num = numpy.zeros(numWords)      # (the smoothed version uses ones() and 2.0)
    p0Denom = 0.0; p1Denom = 0.0       # initialize the probability values
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]    # vector addition
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num / p1Denom           # element-wise division; change to log() later
    p0Vect = p0Num / p0Denom           # change to log() later
    return p0Vect, p1Vect, pAbusive
```
The run's output:

```
======================= RESTART: D:\迅雷下载\bayesTest1.py =======================
>>> pAb
0.5
>>> p0V
array([ 0.125     ,  0.        ,  0.04166667,  0.        ,  0.        ,
        0.04166667,  0.04166667,  0.04166667,  0.        ,  0.08333333,
        0.04166667,  0.        ,  0.04166667,  0.        ,  0.04166667,
        0.04166667,  0.        ,  0.04166667,  0.04166667,  0.04166667,
        0.        ,  0.        ,  0.04166667,  0.04166667,  0.04166667,
        0.04166667,  0.04166667,  0.04166667,  0.04166667,  0.04166667,
        0.        ,  0.        ])
>>> p1V
array([ 0.        ,  0.05263158,  0.        ,  0.05263158,  0.05263158,
        0.        ,  0.        ,  0.        ,  0.10526316,  0.05263158,
        0.        ,  0.05263158,  0.05263158,  0.05263158,  0.05263158,
        0.        ,  0.05263158,  0.        ,  0.        ,  0.        ,
        0.05263158,  0.05263158,  0.        ,  0.10526316,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.05263158,  0.15789474])
>>>
```
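These numbers can be sanity-checked by hand: the three normal (class 0) documents contain 7 + 8 + 9 = 24 word tokens in total and the three abusive (class 1) documents 8 + 5 + 6 = 19, so with the unsmoothed (zeros-initialized) estimates each per-word count is simply divided by those totals:

```python
# Class-0 docs: 24 tokens total; class-1 docs: 19 tokens total (counted from postingList)
print(3 / 24)  # 0.125        -> 'my' occurs 3 times in class 0 (largest p0V entry)
print(1 / 24)  # 0.0416666... -> the 0.04166667 entries of p0V
print(3 / 19)  # 0.1578947... -> 'stupid' occurs 3 times in class 1 (largest p1V entry)
```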
2. Today I finally got the naive Bayes classifier to classify text.
```python
def testingNB():
    listOPosts, listClasses = loadDataSet()        # load the data set
    myVocabList = createVocabList(listOPosts)      # merge the documents, dropping duplicate words
    trainMat = []
    for postinDoc in listOPosts:
        # For each word in postinDoc, record whether it appears in the merged vocabulary
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # trainMat holds the per-document word vectors; listClasses says which documents are abusive
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = setOfWords2Vec(myVocabList, testEntry)
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = setOfWords2Vec(myVocabList, testEntry)
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
```
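testingNB calls classifyNB, which these notes don't list. A minimal sketch, assuming the log-smoothed variant of trainNB0 (ones()/2.0 initialization with log() applied to the probability vectors, as the "change to log()" comments suggest), so the per-word log-likelihoods can simply be summed:

```python
import math

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # vec2Classify is a 0/1 word vector; p0Vec/p1Vec hold per-word log-probabilities.
    # Summing logs corresponds to multiplying the independent word likelihoods.
    p1 = sum(v * w for v, w in zip(vec2Classify, p1Vec)) + math.log(pClass1)
    p0 = sum(v * w for v, w in zip(vec2Classify, p0Vec)) + math.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0
```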
Anyway, I'm very happy.