用Python词云看电影--生而为人,对不起

本文选取的电影是《被嫌弃的松子的一生》。
在这里插入图片描述
词云效果:
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
完整代码:

 1 # 分析豆瓣被嫌弃的松子的一生的影评,生成词云
 2 # https://movie.douban.com/subject/1787291/comments?start=20&limit=20&status=P&sort=new_score
 3 # url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P '\
 4 # % (movie_id, (i - 1) * 20)
 5 
 6 import requests
 7 from stylecloud import gen_stylecloud
 8 import jieba
 9 import re
10 from bs4 import BeautifulSoup
11 from wordcloud import STOPWORDS
12 
# HTTP request headers: a desktop Firefox User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) '
        'Gecko/20100101 Firefox/64.0'
    ),
}
16 
17 
18 
19 
def jieba_cloud(file_name, icon):
    """Render a stylecloud word-cloud image from a UTF-8 text file.

    The file content is segmented with ``jieba.cut``, the tokens are joined
    with single spaces, and the result is passed to ``gen_stylecloud``
    together with the stop words read from ``stopwords.txt``.

    Args:
        file_name: Path of the UTF-8 text file to read.
        icon: Shape selector. "1" through "6" pick a Font Awesome icon from
            the table below; any other value uses gen_stylecloud's default
            icon. ``str(icon)`` is also the stem of the output file name.

    Returns:
        The output image file name, ``str(icon) + '.png'``.
    """
    # Shape selector -> Font Awesome icon class.
    icon_names = {
        "1": "fas fa-grin-hearts",
        "2": "fas fa-space-shuttle",
        "3": "fas fa-heartbeat",
        "4": "fas fa-bug",
        "5": "fas fa-thumbs-up",
        "6": "fab fa-qq",
    }

    with open(file_name, 'r', encoding='utf8') as f:
        # Tokens are separated by a single space.
        result = " ".join(jieba.cut(f.read()))

    # Load the stop-word list; the context manager closes the file, which
    # the previous bare open() never did.
    with open('stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file]

    # Unknown selectors yield None so the fallback branch is actually taken
    # (the old " " placeholder always passed the length check and was handed
    # to gen_stylecloud as if it were an icon name).
    icon_name = icon_names.get(str(icon))
    pic = str(icon) + '.png'
    if icon_name:
        gen_stylecloud(text=result,
                       size=1024,  # width and height of the image
                       icon_name=icon_name,
                       font_path='simsun.ttc',
                       max_font_size=200,  # largest font size in the cloud
                       max_words=2000,  # maximum number of words drawn
                       custom_stopwords=stopwords,  # custom stop-word list
                       output_name=pic)
    else:
        # No matching icon: keep gen_stylecloud's defaults for shape and
        # size, but still filter the stop words.
        gen_stylecloud(text=result,
                       font_path='simsun.ttc',
                       custom_stopwords=stopwords,
                       output_name=pic)
    return pic
59 
60 
def spider_comment(movie_id, page):
    """Fetch Douban short comments for a movie and append them to douban.txt.

    Pages are requested in order, 20 comments per page. The text inside
    each ``<span class="short">`` element is extracted with a regular
    expression and written to ``douban.txt``, one comment per line.

    Args:
        movie_id: Douban subject id interpolated into the comments URL.
        page: Number of comment pages to fetch; 0 fetches nothing.

    Returns:
        A list with every comment string extracted, in fetch order.
    """
    comment_list = []
    with open("douban.txt", "a+", encoding='utf-8') as f:
        for i in range(1, page + 1):
            url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' \
                  % (movie_id, (i - 1) * 20)

            # A timeout keeps one stalled connection from hanging the run.
            req = requests.get(url, headers=headers, timeout=10)
            req.encoding = 'utf-8'
            # Non-greedy so two spans on one line yield two comments.
            comments = re.findall(r'<span class="short">(.*?)</span>', req.text)

            if comments:
                # The trailing newline keeps the last comment of this page
                # from fusing with the first comment of the next page.
                f.write('\n'.join(comments) + '\n')
                comment_list.extend(comments)
    # Print everything collected; also safe when no page was fetched.
    print(comment_list)
    return comment_list
76 
# Script entry point: crawl the comments, then render one cloud per icon.
if __name__ == '__main__':
    movie_id = '1787291'
    page = 10
    spider_comment(movie_id, page)

    # Icon selectors "1" through "6", rendered in ascending order.
    for icon in ("1", "2", "3", "4", "5", "6"):
        jieba_cloud("douban.txt", icon)

 

爬取过程参考:

Python爬取你好李焕英豆瓣短评并利用stylecloud制作更酷炫的词云图

在这里插入图片描述

posted @ 2021-02-26 18:34  BugMiaowu2021  阅读(96)  评论(0)  编辑  收藏  举报