利用Jieba对txt进行分词操作并保存在数据库中
前提:已经安装好Jieba和pymysql
1、创建Python项目
2、pom.xml中加入依赖
<dependencies>
<dependency>
<groupId>org.python</groupId>
<artifactId>jython-standalone</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
3、创建data文件夹,放入需要进行分词的txt文件(文件需为UTF-8编码,程序以utf-8方式读取)
4、partition.py
import jieba
import os
import pymysql
# Directory that holds the text files to segment, and the extension to accept.
txtPath = 'E:\\JetBrains\\Hot word cloud\\partition\\data\\'
txtType = 'txt'


def count_words(words):
    """Return (word, count) pairs sorted by count, most frequent first.

    Tokens that are exactly one character long or consist only of digits
    are ignored. Ties keep their first-occurrence order because the sort
    is stable.
    """
    counts = {}
    for word in words:
        if len(word) == 1 or word.isdigit():
            continue
        counts[word] = counts.get(word, 0) + 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def table_name_for(filename):
    """Return the part of *filename* before its first dot.

    A name without any dot is returned unchanged instead of raising.
    """
    return filename.partition(".")[0]


def quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier.

    Embedded backticks are doubled so a file name cannot break out of the
    identifier and inject SQL.
    """
    return "`" + name.replace("`", "``") + "`"


def save_counts(tablename, items):
    """Create table *tablename* and insert every (word, count) pair into it.

    If the table cannot be created (for example because it already exists)
    the error is printed and the rows are still inserted, matching the
    script's original best-effort behaviour. The connection is always
    closed, even when an insert fails.
    """
    table = quote_identifier(tablename)
    db_conn = pymysql.connect(host="localhost", user="root", password="111",
                              db="hotwordcloud", charset="utf8")
    try:
        with db_conn.cursor() as cur:
            try:
                cur.execute(
                    "create table " + table
                    + "(name char(255), value char(255))"
                )
                db_conn.commit()
                print("创建表结构成功")
            except pymysql.MySQLError as err:
                print("sql语句执行错误", err)
                db_conn.rollback()

            # Values are passed as parameters so the driver escapes them;
            # words containing quotes no longer break the statement.
            # '%' in the table name must be doubled because the driver
            # applies %-formatting to the query text.
            insert_sql = (
                "insert into " + table.replace("%", "%%")
                + " values (%s, %s)"
            )
            rows = [(word, str(count)) for word, count in items]
            cur.executemany(insert_sql, rows)
            db_conn.commit()  # one commit for the whole batch
            print("插入数据成功")
    finally:
        db_conn.close()


def main():
    """Segment every txt file under txtPath and store its word counts."""
    suffix = "." + txtType
    for txt in os.listdir(txtPath):
        # Skip sub-directories and files of other types.
        if not txt.lower().endswith(suffix):
            continue
        with open(os.path.join(txtPath, txt), encoding='utf-8') as f:
            text = f.read()
        words = jieba.lcut(text)  # precise-mode segmentation
        items = count_words(words)
        print('*********' + txt + '*********')
        print(len(words))
        save_counts(table_name_for(txt), items)


if __name__ == "__main__":
    main()
运行程序即可在数据库中为每个txt文件创建一张对应的表(表名为文件名中第一个"."之前的部分),并将分词结果(词语及其出现次数)存入表中