# Scrape the Douban short reviews of "Memories of Matsuko" and render them
# as word clouds.
# Example comments page:
# https://movie.douban.com/subject/1787291/comments?start=20&limit=20&status=P&sort=new_score
# Paging: page i (1-based) starts at offset (i - 1) * 20, 20 comments per page.

import requests
from stylecloud import gen_stylecloud
import jieba
import re
from bs4 import BeautifulSoup
from wordcloud import STOPWORDS

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'
}

# Font Awesome icon used as the cloud shape for each supported icon id.
_ICON_NAMES = {
    "1": "fas fa-grin-hearts",
    "2": "fas fa-space-shuttle",
    "3": "fas fa-heartbeat",
    "4": "fas fa-bug",
    "5": "fas fa-thumbs-up",
    "6": "fab fa-qq",
}

# Non-greedy so that two spans on one line are not merged into a single match;
# DOTALL so that a comment containing line breaks is still captured.
_SHORT_COMMENT_RE = re.compile(r'<span class="short">(.*?)</span>', re.S)

# Seconds to wait for Douban before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def jieba_cloud(file_name, icon):
    """Segment the text in *file_name* with jieba and render a style cloud.

    Stop words are read from ``stopwords.txt`` in the current working
    directory (one word per line) and the image is written to
    ``<icon>.png``.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
        icon: Icon id, "1" to "6" (a str or an int). Any other value falls
            back to stylecloud's default icon and settings.

    Returns:
        The name of the generated image file.
    """
    with open(file_name, 'r', encoding='utf8') as f:
        word_list = jieba.cut(f.read())

    # stylecloud tokenises on whitespace, so join the segments with spaces.
    result = " ".join(word_list)

    # Load the custom stop words; the context manager closes the file, and
    # blank lines are dropped so no empty stop word is registered.
    with open('stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
        stopwords = [line.strip() for line in stopwords_file if line.strip()]

    icon_key = str(icon)
    pic = icon_key + '.png'
    # None for an unknown id, which selects the default rendering below.
    icon_name = _ICON_NAMES.get(icon_key)

    if icon_name:
        gen_stylecloud(text=result,
                       size=1024,           # width and height of the image
                       icon_name=icon_name,
                       font_path='simsun.ttc',
                       max_font_size=200,   # largest font size in the cloud
                       max_words=2000,      # maximum number of words drawn
                       custom_stopwords=stopwords,
                       output_name=pic)
    else:
        gen_stylecloud(text=result,
                       font_path='simsun.ttc',
                       custom_stopwords=stopwords,
                       output_name=pic)
    return pic


def spider_comment(movie_id, page):
    """Download short comments for a Douban movie and append them to douban.txt.

    Each fetched page holds up to 20 comments. The comments are appended to
    ``douban.txt`` one per line and also printed page by page.

    Args:
        movie_id: Douban subject id of the movie.
        page: Number of comment pages to fetch, starting from the first.
    """
    with open("douban.txt", "a", encoding='utf-8') as f:
        for i in range(1, page + 1):
            url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' \
                  % (movie_id, (i - 1) * 20)

            req = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            req.encoding = 'utf-8'
            comments = _SHORT_COMMENT_RE.findall(req.text)

            if comments:
                # Terminate the batch with a newline so the last comment of
                # this page does not fuse with the first one of the next.
                f.write('\n'.join(comments) + '\n')
            print(comments)


# Script entry point
if __name__ == '__main__':
    movie_id = '1787291'
    page = 10
    spider_comment(movie_id, page)

    # Render one cloud per supported icon shape.
    for icon in ("1", "2", "3", "4", "5", "6"):
        jieba_cloud("douban.txt", icon)