1 import pandas as pd
 2 from sklearn.feature_extraction.text import CountVectorizer
 3 import jieba
 4 import numpy as np
 5 from sklearn.naive_bayes import MultinomialNB
 6 
 7 # 1、加载数据
 8 data = pd.read_csv("./data.csv", encoding="ansi")
 9 print("data:\n", data)
10 print("data 的列索引:\n", data.columns)
11 
12 content = []
13 
14 # 分词
15 for tmp in data.loc[:, "内容 "]:
16     print(tmp)
17     # 以精确模式对文章进行分词
18     seg = jieba.cut(tmp, cut_all=False)
19     # seg = jieba.cut(tmp, cut_all=True)
20     seg_ = ",".join(seg)
21 
22     content.append(seg_)
23 
24 # print(content)
25 
26 data.loc[:, "内容 "] = content
27 
28 print(data)
29 
# Load the stop words: one entry per line of ./stopwords.txt.
with open("./stopwords.txt", mode="r", encoding="utf-8") as f:
    stopwords = f.readlines()

# Strip surrounding whitespace from every entry and drop duplicates in one
# pass; the resulting list order is arbitrary (it comes from a set).
st_list = list({line.strip() for line in stopwords})
40 
# 2. Convert the segmented text into numeric features (per-document word
#    counts).
# 2.1 Instantiate the vectorizer; words listed in st_list are excluded from
#     the vocabulary.
conv = CountVectorizer(stop_words=st_list)
# 2.2 Learn the vocabulary and count the words (returns a sparse matrix).
x = conv.fit_transform(data.loc[:, "内容 "])
print("x: \n", x)

# Retrieve the vocabulary, in the same order as the feature columns.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back on old releases that lack it.
if hasattr(conv, "get_feature_names_out"):
    # .tolist() keeps feature_names a plain list of str, as before.
    feature_names = conv.get_feature_names_out().tolist()
else:
    feature_names = conv.get_feature_names()
# Dense (n_documents, n_words) count array.
res = x.toarray()
print(feature_names)
print("res:\n", res)
54 
# Encode the target column: "好评" (positive review) -> 0,
# "差评" (negative review) -> 1. Any other value is passed through unchanged
# so that the int64 cast below still rejects labels it cannot convert.
label_codes = {"好评": 0, "差评": 1}
encoded = data.loc[:, "评价"].map(lambda value: label_codes.get(value, value))

# Replace the whole column instead of writing through `.loc[:, ...]`:
# on pandas >= 2.0 a `.loc[:, col] = ...` assignment writes into the existing
# object-dtype column in place, so the int64 dtype would be lost, the
# concatenated array below would become object-dtype, and the classifier
# would reject the labels.
data["评价"] = encoded.astype(np.int64)
print("data的type: \n", data.dtypes)

# Append the target as the last column of the dense feature array.
# From here on `data` is a NumPy array, no longer a DataFrame.
data = np.concatenate((res, data.loc[:, "评价"].values.reshape(-1, 1)), axis=1)

print("data:\n", data)
print("data:\n", data.dtype)
69 
# Split into training and test sets using fixed, hand-picked row indices.
train_rows = [0, 1, 3, 6, 8, 9, 10, 11, 12, 4]
test_rows = [2, 5, 7]
train = data[train_rows, :]
test = data[test_rows, :]

# The last column is the label; every column before it is a word count.
train_features, train_labels = train[:, :-1], train[:, -1]
test_features, test_labels = test[:, :-1], test[:, -1]

# Build a multinomial naive Bayes classifier.
# 1. Instantiate the model (alpha is the additive smoothing parameter).
nb = MultinomialNB(alpha=1.0)
# 2. Fit on the training rows.
nb.fit(train_features, train_labels)
# 3. Predict the labels of the test rows.
y_predict = nb.predict(test_features)

# Mean accuracy on the test rows.
score = nb.score(test_features, test_labels)

print("预测结果:\n", y_predict)
print("准确率:\n", score)