# 基于朴素贝叶斯的书籍评价信息分类任务 (Naive-Bayes classification of book-review sentiment)

import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Load the dataset.
# NOTE(review): encoding='ansi' only works on Windows Python (it maps to the
# active code page, typically GBK for Chinese text) — confirm, or spell the
# actual codec (e.g. 'gbk') explicitly for portability.
data = pd.read_csv("./data.csv",encoding='ansi')

print("data:\n",data)
print("data 的列名:\n",data.columns)


# Encode the target column: '好评' (positive) -> 0, '差评' (negative) -> 1.
data.loc[data.loc[:,"评价"] == '好评','评价'] = 0
data.loc[data.loc[:,"评价"] == '差评','评价'] = 1

# Convert the target column from object to int.
# BUGFIX: assigning through data.loc[:, "评价"] may write the values back
# in place and silently keep the object dtype; direct column assignment
# reliably replaces the column with the int-typed series.
data["评价"] = data.loc[:,"评价"].astype('int')

# Segment each review with jieba (precise mode) and re-join the tokens with
# commas so CountVectorizer can later split them back into individual terms.
content_list = [
    ",".join(jieba.cut(text, cut_all=False))
    for text in data.loc[:, '内容 ']  # NOTE: the column name ends with a space
]

print(content_list)

# Build the stop-word list: one word per line in the file, stripped of
# surrounding whitespace and de-duplicated via a set.
with open("./stopwords.txt", encoding='utf-8') as f:
    stop_words = list({line.strip() for line in f})
# Bag-of-words term counting, ignoring the stop words collected above.
con_vec = CountVectorizer(stop_words=stop_words)

# Learn the vocabulary from the segmented reviews and build the
# document-term count matrix (sparse).
X = con_vec.fit_transform(content_list)

feature = X.toarray()
# Vocabulary terms, one per feature column.
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out() but keep the old call as a
# fallback so the script still runs on older installations.
try:
    names = con_vec.get_feature_names_out()
except AttributeError:
    names = con_vec.get_feature_names()
# print(names)
# print(feature)

# Stick the label column onto the right of the feature matrix so a single
# array carries both features and target.
labels = data.loc[:, '评价'].values.reshape((-1, 1))
new_data = np.hstack((feature, labels))

print(new_data)
print(new_data.shape)

# Every column except the last is a feature; the last column is the label.
# The first 10 rows are the training set, the rest the test set.
train_data, test_data = new_data[:10, :], new_data[10:, :]

# Separate features from labels for each split.
x_train, y_train = train_data[:, :-1], train_data[:, -1]
x_test, y_test = test_data[:, :-1], test_data[:, -1]

# Multinomial naive Bayes; alpha=1.0 is Laplace (add-one) smoothing.
nb = MultinomialNB(alpha=1.0)

# Fit the classifier on the training split.
nb.fit(x_train, y_train)

# Class predictions for the held-out rows.
y_predict = nb.predict(x_test)

# Mean accuracy over the test split.
score = nb.score(x_test, y_test)

print("*" * 80)
print("y_predict :\n",y_predict)
print("准确率:\n",score)

 

# posted @ 2019-09-20 20:00  爱学习的小猫咪  阅读(254)  评论(0) (blog footer, commented out so the file parses)