本文记录使用 requests 库和正则表达式模块 re 爬取豆瓣影评的过程。requests 可以通过以下命令安装:pip3 install requests
def login_douban():
    """Log in to Douban by posting the credential form through ``session``.

    The request goes through the module-level ``session`` so that later
    requests made with the same session object share its state.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, or the request failed).
    """
    # NOTE: the login endpoint and the Referer are blank in this listing and
    # have to be filled in before this function can succeed.
    login_url = ''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Referer': ''
    }
    data = {
        'name': '你的用户名',
        'password': '你的密码',
        'remember': 'false'
    }
    try:
        # A timeout keeps a stalled connection from blocking forever; timeout
        # errors are RequestException subclasses and are handled below.
        response = session.post(login_url, headers=headers, data=data,
                                timeout=10)
    except RequestException:
        print('登录失败')
        return None
    if response.status_code == 200:
        return response.text
    return None
# One shared session object used for every request in this script.
session = requests.Session()


def get_comment_one_page(page=0):
    """Fetch the HTML of one page of comments.

    Args:
        page: Zero-based page index; each page holds 20 comments, so the
            request offset is ``page * 20``.

    Returns:
        The page HTML as text on HTTP 200, otherwise ``None`` (non-200
        status, or the request failed).
    """
    start = int(page * 20)
    # NOTE: the URL template is blank in this listing. It must contain one
    # ``%d``-style placeholder for the offset; with an empty template the
    # ``%`` formatting below raises TypeError.
    comment_url = '' % start
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    }
    try:
        # A timeout keeps a stalled connection from blocking the crawl; timeout
        # errors are RequestException subclasses and are handled below.
        response = session.get(comment_url, headers=headers, timeout=10)
    except RequestException:
        print('爬取评论失败')
        return None
    if response.status_code == 200:
        return response.text
    return None
'<div class="comment-item".*?comment-info">.*?rating".*?title="(.*?)">.*?"comment-time.*?title="(.*?)">.*?short">(.*?)</span>.*?</div>'
<div class="comment-item" data-cid="2012951340"> <div class="avatar"> <a title="天天天蓝" href=""> <img src="" class=""> </a> </div> <div class="comment"> <h3> <span class="comment-vote"> <span class="votes">8867</span> <input value="2012951340" type="hidden"> <a href="javascript:;" class="j a_show_login" onclick="">有用</a> </span> <span class="comment-info"> <a href="" class="">天天天蓝</a> <span>看过</span> <span class="allstar50 rating" title="力荐"></span> <span class="comment-time " title="2019-10-25 09:34:22"> 2019-10-25 </span> </span> </h3> <p class=""> <span class="short">应该创造怎样的世界让少年成长是这个电影的主题...</span> </p> </div> </div>
def parse_comment_one_page(html):
    """Extract the comments contained in one page of HTML.

    Args:
        html: HTML text of a comment page. ``None`` or an empty string is
            accepted and produces no items, so the result of a failed fetch
            can be passed in directly.

    Yields:
        One dict per ``comment-item`` block with the keys ``'star'`` (the
        rating's ``title`` attribute), ``'time'`` (the comment time's
        ``title`` attribute) and ``'context'`` (the text inside the
        ``short`` span).
    """
    if not html:
        return
    # re.S lets ``.`` cross line breaks, since one comment spans many lines.
    pattern = re.compile(
        r'<div class="comment-item".*?comment-info">.*?rating".*?title="(.*?)">.*?"comment-time.*?title="(.*?)">.*?short">(.*?)</span>.*?</div>',
        re.S)
    # finditer yields matches lazily instead of building the whole list first.
    for match in pattern.finditer(html):
        star, comment_time, text = match.groups()
        yield {
            'star': star,
            'time': comment_time,
            'context': text
        }
def get_comment_all_page():
    """Crawl the comment pages one after another and save every comment.

    Starting at page 0, each page is fetched with ``get_comment_one_page``,
    parsed with ``parse_comment_one_page``, and every comment is printed and
    appended to the comments file via ``write_to_file``. The crawl stops when
    a fetch fails (returns ``None``) or a page contains no comments.
    """
    page = 0
    html = get_comment_one_page(page)
    # Re-check the fetch result on every iteration: a failed fetch on a later
    # page must end the loop instead of being handed to the parser.
    while html is not None:
        found_any = False
        for item in parse_comment_one_page(html):
            found_any = True
            print(item)
            write_to_file(item)
        # To store the page in MySQL instead, pass
        # parse_comment_one_page(html) to save_data_base().
        if not found_any:
            # A page without comments marks the end of the listing.
            break
        page += 1
        # Random pause of up to 3 seconds between consecutive requests.
        time.sleep(random.random() * 3)
        html = get_comment_one_page(page)
    print('爬取完毕')
def write_to_file(comments):
    """Append one comment record to ``COMMENTS_FILE_PATH`` as a JSON line.

    Args:
        comments: A JSON-serialisable object (the crawler passes one comment
            dict). Non-ASCII text is written as-is, not ``\\u``-escaped.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(COMMENTS_FILE_PATH, 'a', encoding='utf-8') as f:
        f.write(json.dumps(comments, ensure_ascii=False) + '\n')


def save_data_base(items):
    """Insert comment records into the ``shaoniandeni`` MySQL table.

    Args:
        items: Iterable of dicts with the keys ``'star'``, ``'time'`` and
            ``'context'``. Nothing is done when it is empty.

    Database errors are not propagated: the transaction is rolled back and
    the traceback is printed.
    """
    rows = [(item['star'], item['time'], item['context']) for item in items]
    if not rows:
        return
    # NOTE: the host is blank in this listing and the credentials are
    # hard-coded; adjust both for a real deployment.
    connection = pymysql.connect(
        host='',
        port=10080,
        user='test',
        password='123456',
        db='webspider')
    cmd = "insert into shaoniandeni (star,time,context) values (%s,%s,%s)"
    try:
        # Creating the cursor inside the try guarantees the connection is
        # closed even if cursor() itself fails.
        with connection.cursor() as cursor:
            cursor.executemany(cmd, rows)
        connection.commit()
    except Exception:
        connection.rollback()
        traceback.print_exc()
    finally:
        connection.close()
import os
import re
import json
import time
import jieba
import random
import requests
import traceback
import pymysql
import numpy as np
import pymysql.cursors
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from requests.exceptions import RequestException

# File the crawled comments are appended to, one JSON object per line.
COMMENTS_FILE_PATH = 'douban_comments.txt'
# Font used to render the word cloud
WC_FONT_PATH = 'C:\\Windows\\fonts\\STFANGSO.TTF'
# Image that gives the word cloud its shape
WC_MASK_IMG = 'index.jpg'

# One shared session object used for every request in this script.
session = requests.Session()


def login_douban():
    """Log in to Douban by posting the credential form through ``session``.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, or the request failed).
    """
    # NOTE: the login endpoint, the Referer and the credentials are blank in
    # this listing and have to be filled in before this function can succeed.
    login_url = ''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Referer': ''
    }
    data = {
        'name': '',
        'password': '',
        'remember': 'false'
    }
    try:
        # A timeout keeps a stalled connection from blocking forever; timeout
        # errors are RequestException subclasses and are handled below.
        response = session.post(login_url, headers=headers, data=data,
                                timeout=10)
    except RequestException:
        print('登录失败')
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_comment_one_page(page=0):
    """Fetch the HTML of one page of comments.

    Args:
        page: Zero-based page index; each page holds 20 comments, so the
            request offset is ``page * 20``.

    Returns:
        The page HTML as text on HTTP 200, otherwise ``None`` (non-200
        status, or the request failed).
    """
    start = int(page * 20)
    # NOTE: the URL template is blank in this listing. It must contain one
    # ``%d``-style placeholder for the offset; with an empty template the
    # ``%`` formatting below raises TypeError.
    comment_url = '' % start
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    }
    try:
        response = session.get(comment_url, headers=headers, timeout=10)
    except RequestException:
        print('爬取评论失败')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_comment_one_page(html):
    """Extract the comments contained in one page of HTML.

    Args:
        html: HTML text of a comment page. ``None`` or an empty string is
            accepted and produces no items.

    Yields:
        One dict per ``comment-item`` block with the keys ``'star'``,
        ``'time'`` and ``'context'``.
    """
    if not html:
        return
    # re.S lets ``.`` cross line breaks, since one comment spans many lines.
    pattern = re.compile(
        r'<div class="comment-item".*?comment-info">.*?rating".*?title="(.*?)">.*?"comment-time.*?title="(.*?)">.*?short">(.*?)</span>.*?</div>',
        re.S)
    for match in pattern.finditer(html):
        star, comment_time, text = match.groups()
        yield {
            'star': star,
            'time': comment_time,
            'context': text
        }


def write_to_file(comments):
    """Append one comment record to ``COMMENTS_FILE_PATH`` as a JSON line.

    Args:
        comments: A JSON-serialisable object (the crawler passes one comment
            dict). Non-ASCII text is written as-is, not ``\\u``-escaped.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(COMMENTS_FILE_PATH, 'a', encoding='utf-8') as f:
        f.write(json.dumps(comments, ensure_ascii=False) + '\n')


def get_comment_all_page():
    """Crawl the comment pages one after another and save every comment.

    Each comment is printed and appended to the comments file. The crawl
    stops when a fetch fails (returns ``None``) or a page contains no
    comments.
    """
    page = 0
    html = get_comment_one_page(page)
    # Re-check the fetch result on every iteration: a failed fetch on a later
    # page must end the loop instead of being handed to the parser.
    while html is not None:
        found_any = False
        for item in parse_comment_one_page(html):
            found_any = True
            print(item)
            write_to_file(item)
        # To store the page in MySQL instead, pass
        # parse_comment_one_page(html) to save_data_base().
        if not found_any:
            # A page without comments marks the end of the listing.
            break
        page += 1
        # Random pause of up to 3 seconds between consecutive requests.
        time.sleep(random.random() * 3)
        html = get_comment_one_page(page)
    print('爬取完毕')
def save_data_base(items):
    """Insert comment records into the ``shaoniandeni`` MySQL table.

    Args:
        items: Iterable of dicts with the keys ``'star'``, ``'time'`` and
            ``'context'``. Nothing is done when it is empty.

    Database errors are not propagated: the transaction is rolled back and
    the traceback is printed.
    """
    rows = [(item['star'], item['time'], item['context']) for item in items]
    if not rows:
        return
    # NOTE: the host is blank in this listing and the credentials are
    # hard-coded; adjust both for a real deployment.
    connection = pymysql.connect(
        host='',
        port=10080,
        user='test',
        password='123456',
        db='webspider')
    cmd = "insert into shaoniandeni (star,time,context) values (%s,%s,%s)"
    try:
        # Creating the cursor inside the try guarantees the connection is
        # closed even if cursor() itself fails.
        with connection.cursor() as cursor:
            cursor.executemany(cmd, rows)
        connection.commit()
    except Exception:
        connection.rollback()
        traceback.print_exc()
    finally:
        connection.close()


def cut_word():
    """Segment the saved comments into words.

    Reads the whole comments file and cuts it with jieba in full mode
    (``cut_all=True``).

    Returns:
        The segmented words joined by single spaces.
    """
    with open(COMMENTS_FILE_PATH, "r", encoding="utf-8") as file:
        comment_txt = file.read()
    # Register names that should be kept as single words when cutting.
    for word in ('周冬雨', '易烊千玺', '白夜行', '东野圭吾'):
        jieba.add_word(word)
    wordlist = jieba.cut(comment_txt, cut_all=True)
    return " ".join(wordlist)


def create_word_cloud():
    """Build a word cloud from the saved comments and display it."""
    # Image that gives the word cloud its shape
    wc_mask = np.array(Image.open(WC_MASK_IMG))
    # Words excluded from the cloud
    stop_words = ['就是', '不是', '但是', '还是', '这种', '只是', '这样', '这个', '一个', '什么', '电影', '没有', '真的', '周冬雨', '易烊千玺', '冬雨', '千玺', '我们', '他们', '少年']
    # Word cloud settings: font, background colour, shape and size
    wc = WordCloud(background_color="white", max_words=50, mask=wc_mask,
                   scale=4, max_font_size=50, random_state=42,
                   stopwords=stop_words, font_path=WC_FONT_PATH)
    # Generate the cloud from the segmented comment text
    wc.generate(cut_word())
    # With only a mask set, the cloud takes the shape of the mask image
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # Display the figure; when run as a script nothing is shown without it.
    plt.show()


def main():
    """Entry point: render the word cloud from the existing comments file."""
    # The crawl steps are disabled; uncomment them to log in and refresh the
    # comments file before drawing.
    # login_douban()
    # get_comment_all_page()
    create_word_cloud()


if __name__ == '__main__':
    main()