# jieba text segmentation: remove stop words, add user-dictionary words (jieba文本分词,去除停用词,添加用户词)
"""Extract keywords from a local text with jieba and render a word cloud.

The script reads ``./santi.txt``, strips HTML markup, segments the text with
jieba (using a user dictionary and a stop-word list), prints the top keywords
according to TF-IDF, TextRank and raw frequency, then displays a word cloud
and saves it to ``zzz.png``.
"""
from collections import Counter

import jieba
import jieba.analyse
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pyquery import PyQuery
from wordcloud import WordCloud

# Read the local document; the context manager closes the handle promptly.
with open('./santi.txt', 'r', encoding='utf-8') as _santi_file:
    santi_text = _santi_file.read()

# Load the external user dictionary BEFORE enabling parallel mode: the worker
# processes are created when parallel mode is switched on, so words added to
# the dictionary afterwards would not be visible to the workers.
jieba.load_userdict('./userdict.txt')
# Enable parallel segmentation; the argument is the number of worker
# processes. NOTE(review): jieba documents parallel mode as POSIX-only.
jieba.enable_parallel(4)


def stopwordslist(filepath):
    """Return the stop words stored in *filepath* as a list.

    The file is read as UTF-8 with one stop word per line; surrounding
    whitespace (including the newline) is stripped from every line.
    """
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]


def movestopwords(sentence):
    """Filter an iterable of tokens, dropping stop words and short tokens.

    *sentence* is an iterable of words (for example the generator returned by
    ``jieba.cut``). Tokens of length 1 or less and tokens listed in
    ``./stop_words.txt`` are removed. Returns the remaining tokens as a list,
    in their original order.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopwords = set(stopwordslist('./stop_words.txt'))
    return [word for word in sentence if len(word) > 1 and word not in stopwords]


def main():
    """Print keyword rankings for the document and draw/save its word cloud."""
    # PyQuery(...).text() strips HTML tags before segmentation.
    words = jieba.cut(PyQuery(santi_text).text())
    word_list = movestopwords(words)  # drop stop words and 1-char tokens
    words_split = " ".join(word_list)  # space-separated string of tokens

    allowed_pos = ('ns', 'n', 'vn', 'v')

    # TF-IDF keyword extraction.
    print('以下是tf-tdf算法-------------------------------------------------')
    keywords_tf = jieba.analyse.extract_tags(
        words_split, topK=100, withWeight=True, allowPOS=allowed_pos)
    for word, weight in keywords_tf:
        print(word, weight)

    # TextRank keyword extraction.
    print('以下是textrank算法-------------------------------------------------')
    keywords_rank = jieba.analyse.textrank(
        words_split, topK=100, withWeight=True, allowPOS=allowed_pos)
    for word, weight in keywords_rank:
        print(word, weight)

    # Plain term-frequency count.
    print('以下是纯词频统计-------------------------------------------------')
    mycount = Counter(word_list)
    for key, val in mycount.most_common(100):  # the 100 most frequent words
        print(key, val)

    # Optional settings left at their defaults here: width/height, colormap,
    # and mask (an image array, e.g. np.array(Image.open("./zhihu.png"))).
    wc = WordCloud(
        background_color="#000000",  # background colour
        max_words=50,                # maximum number of words (default 200)
        max_font_size=400,           # largest font size
        min_font_size=10,            # smallest font size (default 4)
        random_state=42,             # fixed seed so the layout is reproducible
        font_path='./SimHei.ttf',    # font file used to render the words
    )
    my_wordcloud = wc.generate(words_split)  # build the cloud from frequencies

    plt.imshow(my_wordcloud)  # display the word cloud
    plt.axis("off")           # hide both axes
    plt.show()
    wc.to_file('zzz.png')     # save the image to disk


if __name__ == '__main__':
    main()