import random
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Default location of the stop-word list
_DEFAULT_STOPWORDS_PATH = r"D:\StudyFiles\ExtracurricularFile\中文文本挖掘\end\data\司法数据\stopwords.txt"


def get_stop(path=_DEFAULT_STOPWORDS_PATH):
    """Read a stop-word file and return its entries as a list.

    Args:
        path: Text file in which every line is treated as one entry.
            Defaults to the absolute path this script was written against.

    Returns:
        A list with one whitespace-stripped string per line, in file
        order. Blank lines yield empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
    # which str.strip() would not remove from the first entry.
    with open(path, 'r', encoding="utf-8-sig") as f:
        return [line.strip() for line in f]
# Remove stop words
def rm_stop(obs, stop=None):
    """Return the tokens of ``obs`` that are not stop words.

    Args:
        obs: Iterable of hashable tokens (e.g. strings).
        stop: Optional collection of stop words. When omitted, the list
            is loaded by calling ``get_stop()``.

    Returns:
        A new list with the tokens of ``obs`` that are absent from the
        stop words; order and duplicates are preserved.
    """
    if stop is None:
        stop = get_stop()
    # Hash-based membership avoids scanning the whole stop list per token.
    if not isinstance(stop, (set, frozenset)):
        stop = set(stop)
    return [token for token in obs if token not in stop]
# Tokenise -> drop stop words -> re-join
def pro_text(docs, sentences, label, stop=None):
    """Tokenise each document, drop stop words, and record it with a label.

    Args:
        docs: Iterable of raw text strings, each passed to ``jieba.lcut``.
        sentences: List extended in place with ``(text, label)`` tuples,
            where ``text`` is the surviving tokens joined by single spaces.
        label: Value stored alongside every document in ``docs``.
        stop: Optional collection of stop words. When omitted, the list
            is loaded by calling ``get_stop()``.

    Returns:
        None. The results are appended to ``sentences``.
    """
    stop_set = None if stop is None else set(stop)
    for doc in docs:
        if stop_set is None:
            # Load once, on the first document, instead of once per
            # document; an empty ``docs`` never touches the file.
            stop_set = set(get_stop())
        kept = [word for word in jieba.lcut(doc) if word not in stop_set]
        sentences.append((" ".join(kept), label))
def _load_segments(csv_path):
    """Return the 'segment' column of a CSV file as a list.

    Rows containing a missing value in any column are dropped first.
    """
    frame = pd.read_csv(csv_path)
    frame.dropna(inplace=True)
    return frame['segment'].values.tolist()


def main():
    """Train three classifiers on the four CSV corpora and print each score.

    Prints, one per line and in this order, the test-set score of the
    k-nearest-neighbours, Gaussian naive Bayes and decision-tree models.
    """
    # 1. Collect the text. Each file supplies one class; its position here
    #    fixes both the label and the order in which documents are gathered.
    sources = [
        (r"data/司法数据/beierzida.csv", 0),
        (r"data/司法数据/beinverda.csv", 1),
        (r"data/司法数据/beilaopoda.csv", 2),
        (r"data/司法数据/beilaogongda.csv", 3),
    ]
    # Read every file before tokenising so a missing file fails early.
    corpora = [(_load_segments(path), label) for path, label in sources]

    # 2. Tokenise and remove stop words
    sentences = []
    for docs, label in corpora:
        pro_text(docs, sentences, label)

    # 3. Separate texts from labels and hold out 20% for testing.
    #    train_test_split shuffles by default, so no explicit shuffle is done.
    x = [text for text, _ in sentences]
    y = [tag for _, tag in sentences]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=13)

    # 4. Feature extraction: bag-of-words counts (TfidfVectorizer offers the
    #    same fit_transform/transform interface if TF-IDF is wanted).
    #    The matrices are densified because GaussianNB needs dense input.
    cv = CountVectorizer()
    cv_train = cv.fit_transform(x_train).toarray()
    cv_test = cv.transform(x_test).toarray()

    # 5. Model building: fit on the training split, score on the test split
    models = [
        KNeighborsClassifier(n_neighbors=5),
        GaussianNB(),
        DecisionTreeClassifier(),
    ]
    for model in models:
        model.fit(cv_train, y_train)
        print(model.score(cv_test, y_test))


if __name__ == '__main__':
    main()