import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import jieba
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# 1. Load the data
# Note: encoding="ansi" only resolves on Windows; on other platforms use the
# file's actual codec (for Chinese text this is typically "gbk").
data = pd.read_csv("./data.csv", encoding="ansi")
print("data:\n", data)
print("column index of data:\n", data.columns)

content = []

# Word segmentation
for tmp in data.loc[:, "内容 "]:
    print(tmp)
    # Segment the text with jieba in precise mode
    seg = jieba.cut(tmp, cut_all=False)
    # seg = jieba.cut(tmp, cut_all=True)  # full mode, for comparison
    seg_ = ",".join(seg)

    content.append(seg_)

# print(content)

data.loc[:, "内容 "] = content

print(data)

# Load the stop words
with open("./stopwords.txt", "r", encoding="utf-8") as f:
    stopwords = f.readlines()
# Strip whitespace around each stop word
st_list = [tmp.strip() for tmp in stopwords]
# print(st_list)
# print(len(st_list))
# De-duplicate the stop words
st_list = list(set(st_list))
# print("number of stop words after de-duplication:\n", len(st_list))

# 2. Convert the text into numeric features
# CountVectorizer counts word occurrences (a TF-IDF vectorizer would weight
# words by importance instead).
# 1) Instantiate the vectorizer
conv = CountVectorizer(stop_words=st_list)
# 2) Count the words
x = conv.fit_transform(data.loc[:, "内容 "])
print("x:\n", x)

# Get the counted vocabulary
# (on scikit-learn < 1.0 this method is called get_feature_names())
feature_names = conv.get_feature_names_out()
res = x.toarray()
print(feature_names)
print("res:\n", res)

# Extract the target values
data.loc[data.loc[:, "评价"] == "好评", "评价"] = 0  # positive review -> 0
data.loc[data.loc[:, "评价"] == "差评", "评价"] = 1  # negative review -> 1

# print(data.dtypes)
# Cast the target values to int
data.loc[:, "评价"] = data.loc[:, "评价"].astype(np.int64)
print("dtypes of data:\n", data.dtypes)

# Concatenate the feature array with the target-value column
data = np.concatenate((res, data.loc[:, "评价"].values.reshape(-1, 1)), axis=1)

print("data:\n", data)
print("data:\n", data.dtype)

# Split into training and test sets (manual index split over the 13 samples)
train = data[[0, 1, 3, 6, 8, 9, 10, 11, 12, 4], :]
test = data[[2, 5, 7], :]

# Classify with a naive Bayes model
# 1) Instantiate the estimator
nb = MultinomialNB(alpha=1.0)
# 2) Fit on the training data
nb.fit(train[:, :-1], train[:, -1])
# 3) Predict on the test data
y_predict = nb.predict(test[:, :-1])

# Compute the accuracy
score = nb.score(test[:, :-1], test[:, -1])

print("predictions:\n", y_predict)
print("accuracy:\n", score)
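
# --- Optional sketch (not part of the original script) ---
# The manual index split above hard-codes row positions, so it only works for
# this exact 13-sample dataset. A hedged alternative, assuming the `data`
# array built above, is sklearn's train_test_split; the test_size and
# random_state values below are illustrative choices, not from the source.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data[:, :-1], data[:, -1], test_size=0.25, random_state=42
)
nb_alt = MultinomialNB(alpha=1.0)
nb_alt.fit(x_train, y_train)
print("sketch accuracy (random split):\n", nb_alt.score(x_test, y_test))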