中文手机评论情感分类系列(三)

第三部分,手机各属性评论的情感分类,分类器是之前训练好的效果最好的朴素贝叶斯分类器。结果展示各属性评论的正负情感分布。

class PredictSentiment():
    """Naive-Bayes sentiment classification of Chinese review CSV files.

    The pipeline is: load stop words -> segment every review with jieba ->
    bag-of-words vectorisation -> fit ``MultinomialNB`` on the training file
    -> predict the target file and print its positive/negative ratio.

    The methods use the following names, which must be available at module
    level: ``re``, ``jieba``, ``np`` (numpy), ``pd`` (pandas),
    ``CountVectorizer`` and ``MultinomialNB``.

    Attributes:
        stopWord: list of stop words, one per line of the stop-word file.
        stopWord_Path: path of the stop-word file that was loaded.
    """

    # Stop-word file used when the constructor is called without a path.
    DEFAULT_STOPWORD_PATH = 'D:/论文文件/学习文件/情感分析/dict词典/哈工大stopword .txt'

    def __init__(self, stopword_path=None):
        """Load the stop-word list.

        Args:
            stopword_path: optional path to a text file with one stop word
                per line. Defaults to ``DEFAULT_STOPWORD_PATH``.

        Raises:
            OSError: if the stop-word file cannot be opened.
        """
        self.stopWord = []
        self.stopWord_Path = (
            self.DEFAULT_STOPWORD_PATH if stopword_path is None else stopword_path
        )
        # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM that
        # would otherwise be glued to the first stop word.
        with open(self.stopWord_Path, 'r', encoding='utf-8-sig') as fr:
            for word in fr:
                self.stopWord.append(word.strip())
        # Snapshot used for O(1) membership tests in cut_word. It is taken
        # once here; rebuild it if stopWord is modified afterwards.
        self._stopword_set = frozenset(self.stopWord)
        # train_path -> (tokenised training texts, label array); avoids
        # re-reading and re-segmenting the same training file on every call.
        self._train_cache = {}

    def cut_word(self, sent):
        """Segment one sentence and return its tokens joined by spaces.

        ASCII letters and digits are removed first, then the text is cut
        with jieba and tokens found in the stop-word list are dropped. The
        space-joined form is what ``CountVectorizer`` consumes in ``vect``.

        Args:
            sent: the raw review text.

        Returns:
            A single string of the remaining tokens separated by spaces.
        """
        # '+' instead of '*': same result, without matching the empty string
        # at every position of the input.
        line = re.sub(r'[a-zA-Z0-9]+', '', sent)
        wordList = jieba.lcut(line, cut_all=False)
        stop_words = self._stopword_set
        return ' '.join([word for word in wordList if word not in stop_words])

    def _tokenize_frame(self, frame):
        """Return the segmented ``comment`` column of *frame* as a list."""
        # Empty CSV cells are read as NaN; treat them as empty text so that
        # row alignment with the labels is kept and re.sub does not fail.
        comments = frame.comment.fillna('').astype(str)
        return [self.cut_word(text) for text in comments]

    def getStringList(self, file_path):
        """Read a review CSV and return its segmented comments.

        Args:
            file_path: path of a comma-separated, GBK-encoded CSV file that
                has a ``comment`` column.

        Returns:
            A list with one space-joined token string per row.
        """
        frame = pd.read_csv(file_path, sep=',', encoding='GBK')
        return self._tokenize_frame(frame)

    def vect(self, trainList, predictList1):
        """Vectorise training and prediction texts with a shared vocabulary.

        The vocabulary is learned from *trainList* only; *predictList1* is
        transformed with that same vocabulary.

        NOTE(review): CountVectorizer's default token_pattern keeps only
        tokens of two or more characters, so single-character words are
        presumably ignored here - confirm that this is intended.

        Args:
            trainList: list of space-joined token strings used for fitting.
            predictList1: list of space-joined token strings to transform.

        Returns:
            A tuple ``(train_X, pre_X)``: the dense training count matrix
            and the transformed prediction matrix.
        """
        count_vec = CountVectorizer(min_df=1)
        words_vec = count_vec.fit_transform(trainList)
        train_X = words_vec.toarray()
        pre_X = count_vec.transform(predictList1)
        return train_X, pre_X

    @staticmethod
    def _sentiment_ratios(pre_result):
        """Return ``(positive_ratio, negative_ratio)`` for predicted labels.

        Labels equal to 1 count as positive; everything else is negative.
        An empty prediction yields ``(0.0, 0.0)`` instead of dividing by
        zero.
        """
        total = len(pre_result)
        if total == 0:
            return 0.0, 0.0
        positive = int(np.count_nonzero(np.asarray(pre_result) == 1)) / total
        return positive, 1 - positive

    def naiveBayes(self, train_X, train_Y, pre_X):
        """Fit a multinomial naive Bayes model and report the sentiment split.

        Prints the share of samples predicted as label 1 and its complement.

        Args:
            train_X: training feature matrix.
            train_Y: training labels.
            pre_X: feature matrix of the samples to classify.

        Returns:
            The array of predicted labels for *pre_X*.
        """
        clf = MultinomialNB()
        clf.fit(train_X, train_Y)
        pre_result = clf.predict(pre_X)
        positive, negative = self._sentiment_ratios(pre_result)
        print('正向情感率', positive)
        print('负向情感率', negative)
        return pre_result

    def _load_training(self, train_path):
        """Return ``(texts, labels)`` for *train_path*, cached per path.

        The CSV is parsed once and both the ``comment`` and ``label``
        columns are taken from that single read. The result is cached on
        the instance, so later changes to the file are not picked up by
        the same instance.
        """
        cached = self._train_cache.get(train_path)
        if cached is None:
            frame = pd.read_csv(train_path, sep=',', encoding='GBK')
            cached = (self._tokenize_frame(frame), np.array(list(frame.label)))
            self._train_cache[train_path] = cached
        return cached

    def nb_classify(self, train_path, pre_path):
        """Run the whole pipeline for one training file and one target file.

        Args:
            train_path: GBK CSV with ``comment`` and ``label`` columns.
            pre_path: GBK CSV with a ``comment`` column to classify.

        Returns:
            The array of predicted labels for the target file. The
            positive/negative ratios are printed by ``naiveBayes``.
        """
        train_X, train_Y = self._load_training(train_path)
        pre_X = self.getStringList(pre_path)
        train_vec_X, pre_vec_X = self.vect(train_X, pre_X)
        return self.naiveBayes(train_vec_X, train_Y, pre_vec_X)


if __name__=='__main__':
    # Wall-clock start, used for the elapsed-time report at the end.
    s = time.time()
    train_path = 'D:/machinelearning data/crawlerData/train_phone1.csv'
    # Whole-product review files. Each variable points at the file of the
    # brand it is named after (the two paths were previously swapped).
    predict_path_xiaomi = 'D:/machinelearning data/crawlerData/xiaomi6X_pre_JD100.csv'
    predict_path_huawei = 'D:/machinelearning data/crawlerData/huaweiP20_pre_JD100.csv'

    # Instantiate the classifier; this loads the stop-word list.
    demo = PredictSentiment()
    # To classify a whole-product file instead of the per-attribute files:
    #   demo.nb_classify(train_path, predict_path_xiaomi)

    # One CSV per product attribute is expected under the directories below.
    file_name = ['camera', 'processor', 'price', 'performance', 'endurance', 'appearance', 'serve']
    abs_path_xiaomi = 'D:\\machinelearning data\\crawlerData\\cluster_data\\feature_phone_xiaomi\\'
    abs_path_huawei = 'D:\\machinelearning data\\crawlerData\\cluster_data\\feature_phone_huawei\\'

    # Classify the reviews of every attribute and print their sentiment split.
    for i in file_name:
        print(i+'的情感分布')
        demo.nb_classify(train_path, abs_path_huawei+i+'.csv')

    e = time.time()
    print('耗时:', e-s)

  分类结果如下:

camera的情感分布
正向情感率 0.96513470681458
负向情感率 0.03486529318541998
processor的情感分布
正向情感率 0.9580152671755725
负向情感率 0.041984732824427495
price的情感分布
正向情感率 0.9159159159159159
负向情感率 0.08408408408408408
performance的情感分布
正向情感率 0.9122340425531915
负向情感率 0.08776595744680848
endurance的情感分布
正向情感率 0.9320754716981132
负向情感率 0.06792452830188678
appearance的情感分布
正向情感率 0.9781771501925546
负向情感率 0.02182284980744542
serve的情感分布
正向情感率 0.9189944134078212
负向情感率 0.08100558659217882
耗时: 139.80674481391907

  

posted @ 2018-10-11 17:48  樟樟22  阅读(742)  评论(0)  编辑  收藏  举报