代码很简单,一看就懂。
(没有模拟点击,所以都是未展开的)
地址:
https://movie.douban.com/subject/26266893/reviews?rating=&start=0
这里start就是后面参数变化的地方。一页20条,所以循环的话 每次start加20就好。
代码:
import requests from bs4 import BeautifulSoup page=0 # url = 'https://movie.douban.com/subject/26266893/reviews?rating=&start='+str(page) # s = requests.session() # res = s.get(url).text # soup = BeautifulSoup(res,'html.parser') # comments = soup.find_all("div",class_="short-content") # print (comments[2].get_text().replace("(展开)"," ").strip()) while 1: url = 'https://movie.douban.com/subject/26266893/reviews?rating=&start=' + str(page) s = requests.session() res = s.get(url).text soup = BeautifulSoup(res, 'html.parser') comments = soup.find_all("div", class_="short-content") with open('流浪地球.txt', 'a', encoding='gb18030', errors='ignore') as f: for comment in comments: str_comment = comment.get_text().replace("(展开)"," ").strip() text = str_comment.replace("这篇影评可能有剧透"," ").strip() f.write(text+"\n") page+=20
生成词云代码:
# -*- coding: utf-8 -*- from wordcloud import WordCloud import matplotlib.pyplot as plt import jieba # 生成词云 def create_word_cloud(filename): text = open("流浪地球.txt".format(filename), encoding='gb18030', errors='ignore').read() # 结巴分词 wordlist = jieba.cut(text, cut_all=True) wl = " ".join(wordlist) # 设置词云 wc = WordCloud( # 设置背景颜色 background_color="white", # 设置最大显示的词云数 max_words=2000, # 这种字体都在电脑字体中,一般路径 font_path='C:\Windows\Fonts\simfang.ttf', height=1200, width=1600, # 设置字体最大值 max_font_size=100, # 设置有多少种随机生成状态,即有多少种配色方案 random_state=30, ) myword = wc.generate(wl) # 生成词云 # 展示词云图 plt.imshow(myword) plt.axis("off") plt.show() wc.to_file('py_book_流浪地球.png') # 把词云保存下 if __name__ == '__main__': create_word_cloud('word_py')
结果:
撸码千万条,简洁第一条。代码不规范,编译两行泪。