Python learning: added the jieba library and wordcloud to generate a word cloud
Version:
Added the jieba library and wordcloud to generate a word cloud from the fetched text.
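Before the full script, a minimal sketch of the jieba + wordcloud pipeline it builds on. The sample sentence, font file, and output name below are placeholders for illustration, not part of the original project.

import jieba
from wordcloud import WordCloud

sample = "我们 一起 学习 Python 生成 词云"                  # placeholder text
words = " ".join(jieba.cut(sample, cut_all=False))          # precise-mode segmentation
cloud = WordCloud(font_path="simhei.ttf",                   # any CJK font available locally
                  background_color="white").generate(words)
cloud.to_file("demo.jpg")                                   # placeholder output file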
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
__author__ = '王益夫'
__mtime__ = '2019/12/20'
'''
'''
Version history:
V 1.0: analyze the text fetched by the get script with jieba and a word cloud
'''
import jieba
from wordcloud import WordCloud
from os import path
import re
import matplotlib.pyplot as plt
#from scipy.misc import imread
import imageio

file_path = path.dirname(__file__) + r'/temp'
file_name1 = r'新闻联播.txt'
file_name2 = r'StopWords.txt'
file_name3 = r'AddWords.txt'
TextPath = file_path + '/' + file_name1
StopWordsPath = file_path + '/' + file_name2
AddWordsPath = file_path + '/' + file_name3
print(AddWordsPath)


def jiebaclearText(text):
    # Segment the text with jieba and drop stop words and single characters.
    mywordslist = []
    seg_list = jieba.cut(text, cut_all=False)
    # seg_list = jieba.cut(TestStr, cut_all=True)   Full mode: builds every word the text can form; very fast, but cannot resolve ambiguity and is less accurate.
    # seg_list = jieba.cut(TestStr, cut_all=False)  Precise mode (default): splits the sentence as accurately as possible; best suited to text analysis and the usual choice for Chinese segmentation.
    # seg_list = jieba.cut_for_search(TestStr)      Search-engine mode: re-splits long words on top of precise mode to improve recall; suited to search-engine indexing.
    liststr = "/".join(seg_list)
    f_stop = open(StopWordsPath, encoding='utf-8', errors='ignore')
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()
    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordslist.append(myword)
    return ' '.join(mywordslist)


def addWordsRulls(text):
    # Collect book-title-bracketed phrases (《...》) and append them to AddWords.txt.
    addwords_list = set()
    try:
        results = re.findall('《[^》]+》', text)
        for result in results:
            addwords_list.add(result)
            # jieba.add_word(result)
        return True
    except Exception:
        # The original `raise e` here made the two lines below unreachable;
        # record the failure marker and return False instead.
        addwords_list.add('EOR:ADD正则解析失败,未获取关键词!')
        return False
    finally:
        with open(AddWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_add:
            for line in list(addwords_list):
                file_add.write(line + '\n')


def StopWordsRulls(text):
    # Collect date strings (yyyy年mm月dd日) and append them to StopWords.txt.
    Stopwords_list = set()
    try:
        results = re.findall(r'\d{4}年\d{1,2}月\d{1,2}日', text)
        for result in results:
            print(result)
            Stopwords_list.add(result)
            # jieba.add_word(result)
        return True
    except Exception:
        # Same fix as addWordsRulls: drop the unreachable `raise e`.
        Stopwords_list.add('EOR:Stop正则解析失败,未获取关键词!')
        return False
    finally:
        with open(StopWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_Stop:
            for line in list(Stopwords_list):
                file_Stop.write(line + '\n')


def main():
    with open(TextPath, encoding='utf-8', errors='ignore') as file_Text:
        text = file_Text.read()

    # for key in analyse.extract_tags(text, 50, withWeight=False):
    #     print(key)

    if addWordsRulls(text) and StopWordsRulls(text):
        with open(AddWordsPath, 'r', encoding='utf-8', errors='ignore') as file_read:
            # Read one custom word per line; the original set(file_read.read())
            # iterated over single characters instead of lines.
            context = set(file_read.read().splitlines())
            for line in context:
                jieba.add_word(line)

        text_text = jiebaclearText(text)
        color_mask = imageio.imread(file_path + "/template.jpeg")
        cloud = WordCloud(
            # Set a CJK font, otherwise Chinese characters render as boxes
            font_path="./temp/HYQiHei-25J.ttf",
            # font_path=path.join(d, 'simsun.ttc'),
            # Background color
            background_color='white',
            # Word-cloud shape
            mask=color_mask,
            # Maximum number of words
            max_words=200,
            # Largest font size
            max_font_size=40
        )
        # wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2).generate(text_text)
        word_cloud = cloud.generate(text_text)  # generate the word cloud
        word_cloud.to_file("test.jpg")          # save the image
        # Display the word cloud
        plt.imshow(word_cloud)
        plt.axis('off')
        plt.show()


if __name__ == '__main__':
    main()
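For reference, a small demo of the three jieba segmentation modes described in the comments of jiebaclearText. The sample sentence is the one used in jieba's own documentation and is not taken from this project.

import jieba

sentence = "我来到北京清华大学"

# Full mode: every word the text can form; fast but ambiguous.
print("Full mode:   ", "/".join(jieba.cut(sentence, cut_all=True)))
# Precise mode: the default, best for text analysis.
print("Precise mode:", "/".join(jieba.cut(sentence, cut_all=False)))
# Search-engine mode: re-splits long words to improve recall.
print("Search mode: ", "/".join(jieba.cut_for_search(sentence)))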