
Python Scraper: Douban (Movie/Book) Comments with Visualization

Use a web scraper to collect short-comment data for movies and books on Douban, then visualize it.

When instantiated, the class asks for a category (movie or book) and the corresponding Douban id.
It produces a rough visualization of the movie/book comment data (for reference only).

Scraping is done mainly with the requests library
Data is extracted with re regular expressions (see the sketch below)
A 3-second sleep is added between page requests
pandas is used for simple statistics
matplotlib draws the charts
jieba segments the short comments to generate a word cloud
Data is saved in the current working directory
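The extraction step relies on two regular expressions (compiled in parse_html below). Here is a minimal sketch of what they capture, run against a hypothetical fragment shaped like Douban's comment markup rather than a real response:

import re

# Hypothetical fragment in the shape the regexes expect (not fetched from Douban)
sample_html = '''
<span class="comment-info">
    <span class="allstar50 rating" title="力荐"></span>
</span>
<span class="short">A short comment about the film.</span>
'''

re_short = re.compile('<span class="short">(.*?)</span>', re.S)
re_star = re.compile('rating" title="(.*?)"></span>')

print(re_short.findall(sample_html))  # ['A short comment about the film.']
print(re_star.findall(sample_html))   # ['力荐']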

# -*- coding: utf-8 -*-
# @Author  : LuoXian
# @Date    : 2020/2/11 22:07
# Software : PyCharm
# version: Python 3.8
# @File    : demo.py

# Import required libraries
import time
import re
import requests
import seaborn as sns
import jieba  # jieba Chinese word segmentation
import pandas as pd  # data analysis
import matplotlib.pyplot as plt  # chart plotting
from wordcloud import WordCloud  # word cloud generation


# Class that scrapes and analyzes Douban short comments
class DouBanshorts(object):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.130 Safari/537.36',
    }

    def __init__(self):
        self.douban_type = input('Enter the type to fetch (movie/book): ')
        self.douban_id = input('Enter the movie/book id: ')
        print('\tStarting analysis of Douban short comments\n-------------------------------\n')

    # Fetch page HTML
    @staticmethod
    def get_html(url):
        r = requests.get(url, headers=DouBanshorts.headers)
        r.encoding = 'utf-8'
        return r.text

    # Get the book/movie title from the comments page
    def get_book_name(self):
        url = f'https://book.douban.com/subject/{self.douban_id}/comments/hot?p=1' if self.douban_type == 'book' \
            else f'https://movie.douban.com/subject/{self.douban_id}/comments?start=0&limit=20&sort=new_score&status=P'
        html = DouBanshorts.get_html(url)
        book_name = re.findall('<h1>(.*?) 短评</h1>', html)[0]  # the page <h1> ends with ' 短评' ('short comments')
        return book_name

    # Extract short comments and star ratings
    def parse_html(self):
        book_name = self.get_book_name()
        print(book_name)
        if self.douban_type == 'book':
            urls = [f'https://book.douban.com/subject/{self.douban_id}/comments/hot?p={i}' for i in range(1, 100)]
        else:
            urls = [f'https://movie.douban.com/subject/{self.douban_id}/comments?start={i}&limit=20&sort=new_score&status=P' for i in range(0, 1000, 20)]
        shorts, stars = [], []
        # Compile the regular expressions
        re_short = re.compile('<span class="short">(.*?)</span>', re.S)
        re_star = re.compile('rating" title="(.*?)"></span>')
        count = 0
        # Loop over the pages, collecting comments and ratings
        for url in urls:
            count += 1
            print(f'Fetching comments, page {count}...')
            html = DouBanshorts.get_html(url)
            short = re_short.findall(html)
            eva = re_star.findall(html)
            # If comments were found, append them; otherwise stop paging
            if short:
                print(short[0])  # print the first comment of the page as a progress check
                for i in zip(short, eva):
                    shorts.append(i[0])
                    stars.append(i[1])
            else:
                break
            time.sleep(3)
        print('\nData extraction finished\n')
        self.pandas_data(book_name, stars, shorts)

    # Save the data with pandas
    def pandas_data(self, book_name, stars, shorts):
        # Save as CSV
        data = pd.DataFrame()
        data['stars'] = stars
        data['shorts'] = shorts
        data.to_csv(f'{book_name}.csv', index=False, encoding='utf-8', mode='a')  # mode='a' keeps appending (including headers) on repeated runs

    # Draw a donut-style pie chart
    def pie(self, data=None, title=None, bookName='chart', length=6, height=6, dpi=100):
        sns.set_style('ticks')  # set the seaborn plotting style
        plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
        plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box
        fig, ax = plt.subplots(figsize=(length, height), dpi=dpi)
        size = 0.5
        labels = data.index
        ax.pie(data, labels=labels,
               startangle=90, autopct='%.1f%%', colors=sns.color_palette('husl', len(data)),
               radius=1,  # pie radius (default 1)
               pctdistance=0.75,  # position of the percentage labels
               wedgeprops=dict(width=size, edgecolor='w'),  # donut width and edge color
               textprops=dict(fontsize=10)  # label font size
               )
        ax.set_title(title, fontsize=15)
        plt.savefig(f'{bookName}_chart.png')  # save the figure
        plt.show()

    # Show the star-rating distribution as a pie chart
    def show_chart(self, book_name):
        df = pd.read_csv(f'{book_name}.csv')
        stars = df['stars'].value_counts().sort_values(ascending=False)
        print(stars)
        print('Generating pie chart...')
        stars_title = f'{book_name} Star Ratings'
        self.pie(stars, stars_title, book_name)

    # Generate a word cloud from the short comments
    def show_words(self, book_name):
        print('Generating word cloud...')
        df = pd.read_csv(f'{book_name}.csv')
        df['shorts'].to_csv(f'{book_name}.txt', encoding='utf-8', index=False)
        with open(f'{book_name}.txt', 'r', encoding='utf-8') as f:
            content = f.read()
            words = jieba.lcut(content)
            all_words = ' '.join(words)
            word_cloud = WordCloud(background_color='white', width=1280, height=720, font_path='msyh.ttc',
                                   max_words=200, max_font_size=100).generate(all_words)
            word_cloud.to_file(f'{book_name}_wordcloud.png')

    # Full pipeline: fetch the title, scrape comments, then draw the chart and word cloud
    def super(self):
        book_name = self.get_book_name()
        self.parse_html()
        self.show_chart(book_name)
        self.show_words(book_name)

# Instantiate and run
douban = DouBanshorts()
douban.super()
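
For quick testing, the interactive prompts can be bypassed by setting the attributes directly. A minimal sketch (the id below is a hypothetical example, not verified against Douban):

# Non-interactive run (sketch): skip __init__'s input() calls and set attributes by hand
douban = DouBanshorts.__new__(DouBanshorts)
douban.douban_type = 'movie'
douban.douban_id = '1292052'  # hypothetical example id
douban.super()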


posted @ 2020-02-11 22:53  luoxian