使用python中的requests模块和re模块爬取豆瓣图书信息,并且使用matplotlib模块以及pandas模块分析绘制图书评分分布情况

  • 使用python模块requests和re模块提取豆瓣图书信息
  • 获得数据后再使用matplotlib模块中的pyplot以及pandas模块对图书信息进行处理并绘制图
import requests
import re

class DouBanBookSpider(object):
    """豆瓣图书爬虫"""

    def __init__(self):
        self.url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T"
        self.cookies = """ll="108304"; bid=R6Pt4oTxZi8; _vwo_uuid_v2=D7A53E36D669FBBB91A373A6A66501916|90ed43ffd4cc7e306714b0e17bb6abe1; __utmz=30149280.1588666895.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=30149280; __utma=30149280.1990509793.1588598310.1588601953.1588666895.3; __utmt=1; dbcl2="154205275:TJys/10xBNM"; ck=luz-; __utmt_douban=1; __utmc=81379588; __utma=81379588.192167986.1588666947.1588666947.1588666947.1; __utmz=81379588.1588666947.1.1.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; gr_user_id=0188176d-b263-4c37-a574-475ea719d4f8; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=32e1b649-e9db-4370-bcd7-a67f4f9b64d6; gr_cs1_32e1b649-e9db-4370-bcd7-a67f4f9b64d6=user_id%3A1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1588666947%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fbook.douban.com%252F%22%5D; _pk_ses.100001.3ac3=*; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_32e1b649-e9db-4370-bcd7-a67f4f9b64d6=true; __yadk_uid=cUz687nYvCDbnaYMwl7wHOJWvxhllC7h; __gads=ID=cc9d743007559993:T=1588666950:S=ALNI_MYzY1BkdckBD1ZdHdqirBbTlqnRxA; push_noty_num=0; push_doumail_num=0; __utmb=30149280.4.10.1588666895; __utmb=81379588.3.10.1588666947; _pk_id.100001.3ac3=0b04a4ce0ca8175d.1588666947.1.1588666983.1588666947."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
            "Cookie":self.cookies
        }

    def create_url(self):
        """构造url地址"""
        return [self.url.format(i*20) for i in range(100)]


    def run(self):
        # 创建保存数据的csv文件
        with open("./books.csv","a+",encoding="utf-8") as f:
            f.writelines("ID,Name,Rat,Pin,,Raise\n")
        # 构造url地址
        url_list = self.create_url()
        # 模拟浏览器发送请求,获取响应
        for i in url_list:
            responses =  requests.get(i,headers=self.headers)
            print(responses.content.decode())
            data = responses.content.decode()
            # 数据的提取和保存
            self.extract_data(data)

    def extract_data(self,data):
        patter = "subject_id:\'(.*?)\'"
        book_ids = list(set(re.compile(patter).findall(data)[1:]))
        book_id = []
        book_names = []
        # print(book_id)
        for i in book_ids:
            book_id = i
            book_name_patter = "<a href=\"https://book.douban.com/subject/{}/\" title=\"(.*?)\"".format(i)
            book_name =  re.compile(book_name_patter).findall(data)
            book_names.append(book_name)

        book_pin_num_patter = """    <span class="pl">
        (.*?)
    </span>"""
        book_rat_patter = "<span class=\"rating_nums\">(.*?)</span>"
        book_raise_patter = """<div class="pub">
        
  
  (.*?)

      </div>"""

        book_rat = re.compile(book_rat_patter).findall(data)
        book_pin_num = re.compile(book_pin_num_patter).findall(data,re.S)
        book_raise = re.compile(book_raise_patter).findall(data,re.S)
        print("图书id:{} 图书名称:{} 图书评分:{} 图书评价人数:{} 图书价格:{}".format(book_ids,book_names,book_rat,book_pin_num,book_raise))
        # num = 0


        with open("./books.csv","a+",encoding="utf-8") as f:
            # if num ==0:
            #     f.writelines("ID,Name,Rat,Pin,,Raise\n")
            #     num+=1
            # else:
            content = [book_ids[i]+","+book_names[i][0]+","+book_rat[i]+",+"+book_pin_num[i]+","+book_raise[i]+"\n" for i in range(len(book_ids))]
            print(content)
            [f.writelines(i) for i in content]




def main():
    dou_ban_spider = DouBanBookSpider()
    dou_ban_spider.run()


if __name__ == '__main__':
    main()

 


 

  • 第二部分,使用Pandas模块提取图书评分信息,然后使用pyplot进行绘图

 

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
file_path = "./books.csv"
df = pd.read_csv(file_path)

# 获取所有电影评分
rat_all_list = df["Rat"]
# 评分最低分
min_rat = min(rat_all_list)
# 评分最高分
max_rat = max(rat_all_list)
# 最低分与最高分的差值
dis = max_rat - min_rat
# 直方图的组距
interval = 0.5
# 将图形分为多少组进行呈现
num_bins = int(dis//interval)

# 绘制直方图
font = {
    "family":"Microsoft YaHei",
    "weight":"bold",
    "size":15
}
# 设置中文字体
matplotlib.rc('font',**font)

plt.figure(figsize=(20,8),dpi=100)
plt.hist(rat_all_list,num_bins,color='orange')
plt.grid(alpha=0.5,color='r')
x_ticks = [min_rat]
index = min_rat
while index< max_rat + dis/num_bins:
    index += dis/num_bins
    x_ticks.append(index)

plt.xticks(x_ticks)
plt.xlabel("图书评分")
plt.ylabel("图书的数量")
plt.title("豆瓣1000本小说的评分分布情况")
plt.show()


 

 

 

posted @ 2020-05-05 20:08  一支梨花压海棠  阅读(492)  评论(0编辑  收藏  举报