Python 作业( 运用Jieba库分词以及运用wordcloud库做词云图 )
Jieba库实例
(1) 运用 jieba 库对《三国演义》进行分词,得到词频统计,并按词频从高到低排序。
(2) 根据得到的高频关键词,用 wordcloud 库生成一张词云图。
# Word-frequency statistics and a word cloud for "Romance of the Three Kingdoms".
#
# The script segments the novel with jieba, merges a few character aliases,
# drops a hand-picked set of uninformative words, prints the 20 most frequent
# words, and renders the 200 most frequent words as a masked word cloud.
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import wordcloud as wc
from PIL import Image

# Novel text, read as UTF-8.
TEXT_PATH = "三国演义.txt"
# Font file used to draw the words; set this to the path of the font you need.
FONT_PATH = "C:\\WINDOWS\\FONTS\\MSYHL.TTC"
# Background picture passed to WordCloud as its mask; set this to your image.
MASK_PATH = 'C:\\Users\\Administrator\\Desktop\\tree.jpg'
# Where the rendered word cloud is written.
OUTPUT_PATH = 'mywc.png'

# How many top-ranked words go into the cloud / the printed report.
CLOUD_WORDS = 200
REPORT_WORDS = 20

# Frequent words that carry no information for this analysis.
EXCLUDES = frozenset({
    "将军", "却说", "荆州", "二人", "不可", "不能", "如此", "主公", "商议",
    "如何", "军士", "左右", "军马", "引兵", "次日", "大喜", "天下", "东吴",
    "于是", "今日", "不敢", "魏兵", "陛下", "一人", "都督", "人马", "不知",
    "汉中", "只见", "众将", "后主", "蜀兵", "上马", "大叫", "太守", "此人",
    "夫人", "先主", "后人", "背后", "城中",
})

# Token -> canonical character name, so the different ways of referring to
# one person are counted together.
# NOTE(review): "丞相" is counted as 曹操 here, exactly as in the original
# script; confirm this is intended, since the title may refer to other
# characters in the text as well.
ALIASES = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}


def count_words(words, aliases=ALIASES, excludes=EXCLUDES):
    """Count word occurrences after alias merging and filtering.

    Args:
        words: Iterable of tokens (e.g. the list returned by ``jieba.lcut``).
        aliases: Mapping from a token to the canonical name it is counted
            under. Only read, never modified.
        excludes: Canonical names that must not appear in the result. Only
            read, never modified.

    Returns:
        A ``collections.Counter`` mapping each kept word to its frequency.
    """
    counts = Counter()
    for word in words:
        # Single-character tokens are skipped.
        if len(word) == 1:
            continue
        name = aliases.get(word, word)
        # Filtering on the canonical name replaces the original
        # ``del counts[word]`` pass, which raised KeyError whenever an
        # excluded word did not occur in the text at all.
        if name in excludes:
            continue
        counts[name] += 1
    return counts


def main():
    """Count words in the novel, print the top ones and draw the word cloud."""
    # The context manager closes the file handle once the text is read.
    with open(TEXT_PATH, "r", encoding="utf-8") as source:
        txt = source.read()

    counts = count_words(jieba.lcut(txt))
    # Highest frequency first; ties keep first-seen order.
    ranked = counts.most_common()
    if not ranked:
        raise SystemExit("No words left to analyse in " + TEXT_PATH)

    # Slicing (instead of indexing a fixed range) copes with texts that
    # contain fewer distinct words than the requested number.
    for word, count in ranked[:REPORT_WORDS]:
        print('{:<10}{:>5}'.format(word, count))

    # NOTE(review): every selected word appears exactly once in this string,
    # so the cloud is not weighted by the counts computed above. If frequency
    # weighting is wanted, WordCloud.generate_from_frequencies is the API to
    # look at.
    text = ' '.join(word for word, _ in ranked[:CLOUD_WORDS])

    with Image.open(MASK_PATH) as picture:
        bg_pic = np.array(picture)

    cloud = wc.WordCloud(
        font_path=FONT_PATH,       # font used to draw the words
        background_color="white",  # background colour
        max_words=2000,            # maximum number of words in the cloud
        mask=bg_pic,               # background picture used as the mask
        max_font_size=100,         # largest font size
        random_state=42,
    )
    mywc = cloud.generate(text)

    # Save before showing: plt.show() blocks until the window is closed, and
    # the image would be lost if the process were interrupted meanwhile.
    mywc.to_file(OUTPUT_PATH)

    plt.imshow(mywc)
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()