利用WordCloud和jieba生成词云图(也叫文字云)。(同样的代码,有的图片作为mask能起作用,有的就不起作用,这个还不知道确切原因;可能与WordCloud只把纯白色(#FFFFFF)像素当作被遮罩区域有关,背景透明或不是纯白的图片可能因此不起作用)

Python生成词云的常用库「wordcloud」。安装: pip install wordcloud

wordcloud默认是为了英文文本来做词云的,如果需要制作中文文本词云,就需要先对中文进行分词。这里就需要用到中文分词库「jieba」。安装:pip install jieba

 

 

复制代码
# coding: utf-8
# Project:pythonProject8
# File:词云图.py
# Author:李凤娟
# Date :2023/9/12 14:09
# IDE:PyCharm 
# 功能:生成词云图

from wordcloud import WordCloud
import jieba
from collections import Counter
from matplotlib import pyplot as plt
from imageio.v2 import imread


"""
word.txt里的内容为:
Python
C逆向
C++逆向
C++逆向
C++逆向
C逆向
网络爬虫
数据解析
"""
# Read the whole corpus in one go; word.txt is expected to be UTF-8 text.
with open('word.txt', 'r', encoding='utf-8') as f:
    words = f.read()

# Register custom terms with jieba so they are kept as single tokens
# (complete words may otherwise be split during segmentation).
for term in ("网络爬虫", "JS逆向", "APP逆向", "C逆向", "C++逆向", "网络数据"):
    jieba.add_word(term)

# Segment the text with jieba. For the sample word.txt this gives:
# ['Python', '\n', 'C逆向', '\n', 'C++逆向', '\n', 'C++逆向', '\n',
#  'C++逆向', '\n', 'C逆向', '\n', '网络爬虫', '\n', '数据', '解析']
words_list_jieba = jieba.lcut(words)

# Tokens that must not appear in the word cloud.
# A set stores each entry once and gives O(1) membership tests, which
# matters because the test below runs once per token of the corpus.
excluded_words = {
    "", "\n", " ", "/",
    "小说", "一雄", "如何", "什么", "可以", "只是", "那么", "不是", "就是",
    "一个", "没有", "一部", "我们", "你们", "他们", "如果", "然后", "因为",
    "所以", "这些", "那些", "这个", "那个", "这样", "那样", "一些", "很多",
    "非常", "可能", "一定", "一直", "经常", "不断", "不只", "不要", "不得",
    "不能", "无法", "没法", "必须", "应该", "需要", "工作", "生活", "家庭",
    "朋友", "感觉", "思考", "想法", "方法", "原因", "结果", "可能性", "比较",
    "不同", "相同", "重要", "容易", "困难", "简单", "复杂", "正确", "错误",
    "——", "·", "“", "”", ",", "\'", "\"",
}

# Drop excluded tokens. The strip() guard also removes whitespace-only
# tokens that are not listed explicitly (tab, full-width space U+3000, ...),
# which would otherwise be counted as words.
# Sample result: ['Python', 'C逆向', 'C++逆向', 'C++逆向', 'C++逆向',
#                 'C逆向', '网络爬虫', '数据', '解析']
words_list = [
    x for x in words_list_jieba
    if x.strip() and x not in excluded_words
]

# Count word frequencies. Sample result:
# Counter({'C++逆向': 3, 'C逆向': 2, 'Python': 1, '网络爬虫': 1, '数据': 1, '解析': 1})
word_counter = Counter(words_list)
# (word, count) pairs ordered from most to least frequent, e.g.
# [('C++逆向', 3), ('C逆向', 2), ('Python', 1), ('网络爬虫', 1), ('数据', 1), ('解析', 1)]
sorted_file = word_counter.most_common()
print(sorted_file)
# Load the image used as the mask.
# WordCloud treats pure-white pixels as "masked out" and draws words only on
# the remaining pixels, so an image whose background is transparent or not
# exactly white may appear to have no masking effect.
mask = imread("2.png")

# Build the cloud from the word frequencies, constrained by the mask.
# Font used to render the words: put simhei.ttf in the project directory,
# or set font_path to C:\Windows\Fonts\simhei.ttf.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    background_color='white',
    mask=mask,
).generate_from_frequencies(dict(sorted_file))

# Save the word cloud to disk.
wordcloud_image_path = 'wordcloud.png'
wordcloud.to_file(wordcloud_image_path)

# The image file is complete at this point; everything below only previews
# the result and can be removed without affecting the saved file.

# Preview with the system image viewer.
image = wordcloud.to_image()
image.show()

# Preview with matplotlib. The figure is created BEFORE imshow() so the
# cloud is drawn into it; creating it afterwards would open a second,
# empty window next to the one holding the cloud.
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
复制代码

 

把WordCloud()里的mask去掉,就会按width和height生成指定大小的图片(指定了mask时,width和height会被忽略,图片大小由mask图片决定)
 
# No mask here, so the canvas size is taken from width/height (in pixels).
wordcloud = WordCloud(
    width=800,
    height=1000,
    background_color='white',
    font_path='simhei.ttf',
)
# generate_from_frequencies() fills in the layout on this same object.
wordcloud.generate_from_frequencies(dict(sorted_file))

 



stopwords 的用法
复制代码
# Words to leave out of the cloud.
stopwords = {"爬虫"}

# WordCloud ignores its `stopwords` argument when the cloud is built with
# generate_from_frequencies(), so the stop words are removed from the
# frequency table here instead.
# Matching is on whole tokens: "爬虫" does not remove "网络爬虫".
frequencies = {
    word: count for word, count in sorted_file if word not in stopwords
}

# Build the cloud with the mask.
# Font used to render the words: put simhei.ttf in the project directory,
# or set font_path to C:\Windows\Fonts\simhei.ttf.
# width/height are not passed: when a mask is given WordCloud ignores them
# and uses the shape of the mask instead.
wordcloud = WordCloud(
    font_path='simhei.ttf',
    background_color='white',
    mask=mask,
).generate_from_frequencies(frequencies)
复制代码

 

posted @   勋勋的大宝贝  阅读(404)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· CSnakes vs Python.NET:高效嵌入与灵活互通的跨语言方案对比
· DeepSeek “源神”启动!「GitHub 热点速览」
· 我与微信审核的“相爱相杀”看个人小程序副业
· Plotly.NET 一个为 .NET 打造的强大开源交互式图表库
· 上周热点回顾(2.17-2.23)
点击右上角即可分享
微信分享提示