import random
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Load the stop-word list
def get_stop(
    path=r"D:\StudyFiles\ExtracurricularFile\中文文本挖掘\end\data\司法数据\stopwords.txt",
):
    """Load the stop-word list from a UTF-8 text file.

    Args:
        path: Location of the stop-word file, one entry per line. Defaults
            to the location that used to be hard-coded, so calling
            ``get_stop()`` with no argument behaves as before.

    Returns:
        A list holding one string per line of the file, in file order,
        with leading and trailing whitespace stripped. Blank lines are
        kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'r', encoding="utf-8") as f:
        # Iterate the file object directly; there is no need to build an
        # intermediate list of raw lines with readlines() first.
        return [line.strip() for line in f]
# Remove stop words
def rm_stop(obs, stop=None):
    """Return the tokens of ``obs`` that are not stop words.

    Args:
        obs: Iterable of tokens. Tokens must be hashable (e.g. strings).
        stop: Optional iterable of stop words. When omitted, the list is
            loaded by calling ``get_stop()``, as before.

    Returns:
        A new list with the surviving tokens; order and duplicates are
        preserved.
    """
    if stop is None:
        stop = get_stop()
    # A set gives constant-time membership tests instead of scanning the
    # whole stop list once per token.
    stop_set = set(stop)
    return [token for token in obs if token not in stop_set]
# Tokenize - remove stop words - join
def pro_text(docs, sentences, label):
    """Tokenize documents, drop stop words and collect labelled text.

    Each document is segmented with ``jieba.lcut``; tokens found in the
    list returned by ``get_stop()`` are discarded and the remaining tokens
    are joined with single spaces.

    Args:
        docs: Iterable of raw document strings.
        sentences: List that is extended in place with one
            ``(joined_text, label)`` tuple per document.
        label: Class label stored alongside every document of ``docs``.

    Returns:
        None. The result is delivered through ``sentences``.
    """
    # Load the stop words once per call. Going through rm_stop() for each
    # document would reopen and re-read the stop-word file every time.
    stop_words = set(get_stop())
    for doc in docs:
        kept = [word for word in jieba.lcut(doc) if word not in stop_words]
        sentences.append((" ".join(kept), label))
if __name__ == '__main__':
    # 1. Collect the text. One CSV file per class; the position of a file
    #    in this list is the integer class label (0-3) given to its rows.
    csv_paths = [
        r"data/司法数据/beierzida.csv",
        r"data/司法数据/beinverda.csv",
        r"data/司法数据/beilaopoda.csv",
        r"data/司法数据/beilaogongda.csv",
    ]
    docs_per_class = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        # Drop every row that has a missing value in any column.
        frame.dropna(inplace=True)
        docs_per_class.append(frame['segment'].values.tolist())

    # 2. Tokenize, remove stop words and attach the class label.
    sentences = []
    for label, docs in enumerate(docs_per_class):
        pro_text(docs, sentences, label)

    # 3. Separate the texts from the labels and hold out 20% for testing.
    #    No explicit shuffle is done here; train_test_split shuffles by
    #    default (see the scikit-learn docs), seeded by random_state.
    x = [text for text, _ in sentences]
    y = [target for _, target in sentences]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=13)

    # 4. Bag-of-words features. The vocabulary is fitted on the training
    #    split only and then reused for the test split. TfidfVectorizer
    #    (imported above) can be swapped in here as an alternative.
    cv = CountVectorizer()
    # Dense arrays are used because GaussianNB is documented to need dense
    # input; note that this can be memory-hungry for a large vocabulary.
    cv_train = cv.fit_transform(x_train).toarray()
    cv_test = cv.transform(x_test).toarray()

    # 5. Train each model and print its accuracy on the held-out split,
    #    in the order: k-nearest neighbours, naive Bayes, decision tree.
    models = [
        KNeighborsClassifier(n_neighbors=5),
        GaussianNB(),
        # No random_state is set, so the tree's score may vary between runs.
        DecisionTreeClassifier(),
    ]
    for model in models:
        model.fit(cv_train, y_train)
        print(model.score(cv_test, y_test))
# Category:
# Python data analysis
# Tags:
# Python data analysis
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· 上周热点回顾(2.24-3.2)