新闻分类-中文分词+词云展示(1)
1、导入数据,并创建用于存储关键词的表(此处使用 MySQL)
2、使用jieba进行分词统计并存储到表中
# -*- coding: utf-8 -*-
"""Segment news text with jieba and keep per-keyword counts in MySQL.

Only one source table (``sheet_car``) is handled here; its keyword counts
are written to ``key_sheet_car`` (columns ``keywords`` and ``num``).
"""

# Connection settings shared by every database access in this script.
DB_CONFIG = {
    "host": "127.0.0.1",
    "database": "test1125",
    "user": "root",
    "password": "",
    "port": 3306,
    "charset": "utf8",
}

# Tokens that must never be stored as keywords.
# NOTE(review): the original list contained ',' and '?' twice, which most
# likely means the full-width forms were flattened when the page was
# extracted; both the ASCII and the full-width variants are listed here.
INVALID_WORDS = frozenset({
    ",", ".", "。", ":", "“", "”", '"', "?", "《", "》",
    "(", ")", "{", "}", "!", "%", "℃", "¥", "#",
    ",", ".", ":", "?", "(", ")", "{", "}", "!", "%", "¥", "#",
})


def getdata():
    """Read article text from ``sheet_car`` and update the keyword counts.

    Every row returned by the query is segmented with jieba. Each token
    that passes :func:`checkword` is inserted into ``key_sheet_car`` with a
    count of 1 when it is new, otherwise its count is incremented.
    Rejected tokens are reported on stdout.

    One connection is opened for the whole run and always closed at the
    end, instead of opening a fresh connection for every single token.
    """
    # Third-party packages are imported here rather than at module level so
    # that the helper functions below stay importable (and testable) on a
    # machine without jieba, pandas or a MySQL driver installed.
    import jieba
    import pandas as pd
    import pymysql

    dbconn = pymysql.connect(**DB_CONFIG)
    try:
        sqlcmd = "select content from sheet_car limit 80"
        titles = pd.read_sql(sqlcmd, dbconn)
        print(titles.values)
        for row in titles.values:
            # Skip NULL / non-text cells, which would make join() raise.
            text = ",".join(value for value in row if isinstance(value, str))
            for word in jieba.cut(text):
                if not checkword(word):
                    print("未知词语")
                    continue
                if checkre(dbconn, word, close=False):
                    save_keywords(dbconn, word, close=False)
                    print(word)
                else:
                    updatenum(dbconn, word, close=False)
    finally:
        dbconn.close()


def checkword(word):
    """Return True when *word* may be stored as a keyword.

    Blank or whitespace-only tokens and the punctuation listed in
    ``INVALID_WORDS`` are rejected.
    """
    if not word.strip():
        return False
    return word not in INVALID_WORDS


def save_keywords(db, keyword, *, close=True):
    """Insert *keyword* into ``key_sheet_car`` with an initial count of 1.

    Args:
        db: An open pymysql connection.
        keyword: The keyword text to insert.
        close: Close *db* when finished. Defaults to True, which is what
            the original helper always did; pass False to keep using the
            connection afterwards.

    Failures are reported on stdout and rolled back; they are not raised.
    """
    # The value is passed as a query parameter so the driver escapes it;
    # building the statement with string formatting broke on quotes and
    # allowed SQL injection through the segmented text.
    sql = "INSERT INTO key_sheet_car(keywords,num) VALUES (%s,1)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (keyword,))
        print("true")
        db.commit()
    except Exception:
        print("数据插入失败")
        db.rollback()
    finally:
        if close:
            db.close()


def updatenum(db, keyword, *, close=True):
    """Increment the stored count of *keyword* in ``key_sheet_car``.

    Args:
        db: An open pymysql connection.
        keyword: The keyword whose ``num`` column is increased by one.
        close: Close *db* when finished (default True, as before).

    Failures are reported on stdout and rolled back; they are not raised.
    """
    sql = "update key_sheet_car set num=num+1 where keywords = %s"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (keyword,))
        db.commit()
    except Exception:
        print("数据更新失败")
        db.rollback()
    finally:
        if close:
            db.close()


def checkre(db, keyword, *, close=True):
    """Return True when *keyword* is NOT yet present in ``key_sheet_car``.

    Args:
        db: An open pymysql connection.
        keyword: The keyword to look up.
        close: Close *db* when finished (default True, as before).

    Returns:
        False if at least one matching row exists, True otherwise. A failed
        query is reported on stdout and treated as "not present", which
        matches the original behaviour.
    """
    sql = "select keywords from key_sheet_car where keywords = %s"
    rows = []
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (keyword,))
            rows = list(cursor.fetchall())
        # Kept from the original: committing ends the read transaction.
        db.commit()
    except Exception:
        print("查询数据失败")
        db.rollback()
    finally:
        if close:
            db.close()
    return not rows


if __name__ == '__main__':
    getdata()
3、项目结构
__EOF__

本文作者:往心。
本文链接:https://www.cnblogs.com/lx06/p/15650851.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
本文链接:https://www.cnblogs.com/lx06/p/15650851.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角【推荐】一下。您的鼓励是博主的最大动力!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构
· AI与.NET技术实操系列(六):基于图像分类模型对图像进行分类