jieba库分词
安装好 jieba 库后,搬运了一位博主的代码进行词云测试:
# To use this code, point os.chdir at the directory that holds the txt file.
# Zipin needs the txt file to be in an encoding that open()/read() can decode.
# Cipin needs the jieba package.
# Ciyun (word cloud) needs the wordcloud and matplotlib packages.
import os
import codecs
from collections import Counter

import jieba
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is not available in current SciPy releases; fall back
    # to matplotlib's reader, which is already a dependency of this script.
    from matplotlib.pyplot import imread

os.chdir("/Users/Zhaohaibo/Desktop")


class Hlm(object):
    """Character frequency, word frequency and word cloud for a text file."""

    # Punctuation left out of the character counts. Both the half-width and
    # the full-width forms are listed because Chinese text normally uses the
    # full-width ones.
    _EXCLUDED_CHARS = frozenset(",。!?、()【】<>《》=:+-*—“”…" ",!?():;")

    def Zipin(self, readdoc, writedoc, encoding=None):
        """Count single characters in a file and report the 100 most frequent.

        Args:
            readdoc: Name of the text file to read.
            writedoc: Name of the file the "char, count" lines are written to.
            encoding: Text encoding used for both files. ``None`` keeps the
                platform default chosen by ``open()``.

        Output:
            Prints the top 100 characters with their counts and writes the
            same rows to ``writedoc``. Whitespace and the punctuation in
            ``_EXCLUDED_CHARS`` are not counted.
        """
        counts = Counter()
        with open(readdoc, "r", encoding=encoding) as file_in:
            for line in file_in:
                counts.update(
                    char
                    for char in line
                    # char.strip() is empty exactly for whitespace characters.
                    if char.strip() and char not in self._EXCLUDED_CHARS
                )

        # most_common keeps first-seen order among equal counts, like a
        # stable sort by count in descending order.
        top = counts.most_common(100)

        print('字符\t字频')
        print('=============')
        with open(writedoc, 'w', encoding=encoding) as file_out:
            for entry in top:
                print('%s\t%d' % entry)
                file_out.write('%s, %d\n' % entry)

    @staticmethod
    def _load_stop_words(path):
        """Return the set of stop words in ``path`` (one per line, UTF-8)."""
        # utf-8-sig also accepts files that start with a byte-order mark.
        with open(path, "r", encoding="utf-8-sig") as stop_file:
            words = {line.strip() for line in stop_file}
        words.discard("")  # blank lines are not stop words
        return words

    def Cipin(self, doc, stoplist='stoplist.txt', encoding=None):
        """Build the word-frequency table of a file using jieba segmentation.

        Args:
            doc: Name of the text file to read.
            stoplist: Name of the stop-word file, one word per line, UTF-8.
            encoding: Text encoding of ``doc``. ``None`` keeps the platform
                default chosen by ``open()``.

        Returns:
            A DataFrame with the columns '词' (word) and '词频' (count), in
            order of first occurrence. Stop words and whitespace-only tokens
            are left out.
        """
        counts = Counter()
        with open(doc, "r", encoding=encoding) as text_file:
            for line in text_file:
                # jieba also yields spaces and the trailing newline as
                # tokens; those are not words, so drop them here.
                counts.update(w for w in jieba.cut(line) if w.strip())

        for word in self._load_stop_words(stoplist):
            counts.pop(word, None)

        return pd.DataFrame(list(counts.items()), columns=['词', '词频'])

    def Ciyun(self, doc, mask_path="aixin.jpg",
              font_path='/System/Library/Fonts/STHeiti Medium.ttc',
              out_path="ciyun.jpg", encoding=None):
        """Draw, save and show a word cloud for a text file.

        Args:
            doc: Name of the text file to read.
            mask_path: Image used as the mask (shape) of the cloud.
            font_path: Font file used to render the words; it must contain
                the glyphs of the text (the default is a macOS system font).
            out_path: File the figure is saved to.
            encoding: Text encoding of ``doc``. ``None`` keeps the platform
                default chosen by ``open()``.

        Output:
            Saves the figure to ``out_path`` and opens it with ``plt.show()``.
        """
        with open(doc, "r", encoding=encoding) as text_file:
            text = text_file.read()

        # WordCloud splits its input with a word regex, which cannot find
        # word boundaries in unsegmented Chinese. Segment with jieba and
        # join with spaces so each word becomes its own token.
        segmented = " ".join(jieba.cut(text))

        back_pic = imread(mask_path)
        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            max_words=2000,      # maximum number of words shown in the cloud
            mask=back_pic,
            max_font_size=200,
            random_state=42,     # fixed seed for a reproducible layout
        ).generate(segmented)

        plt.figure(figsize=(64, 32))
        plt.imshow(wc)
        plt.axis('off')
        plt.savefig(out_path)
        plt.show()

    def main(self, readdoc):
        """Return the word-frequency table of ``readdoc`` (see ``Cipin``).

        Only the word-frequency step runs here; call ``Zipin`` or ``Ciyun``
        directly for the character counts or the word cloud.
        """
        return self.Cipin(readdoc)


if __name__ == '__main__':
    hlm = Hlm()
    hlm.Zipin("红楼梦.txt", "红楼梦字频.txt")
    df_hlm1 = hlm.main("红楼梦.txt")

# ---------------------
# Attribution (kept verbatim from the original post):
# 作者:Iovebecky 来源:CSDN
# 原文:https://blog.csdn.net/zhaohaibo_/article/details/81902456
# 版权声明:本文为博主原创文章,转载请附上博文链接!
安装好所需的库并完成调试之后,就能正常生成词云啦。