哔哩哔哩舞蹈区每日前100名内容-Spider

说明:无意滋生事端,仅供学习分享;如有侵权,将立即删除。

用到的模块:json、lxml的etree、time.ctime、requests

源码如下:

import requests
import time
from lxml import etree
import json


class BiLiSpider():
    """Scrape the bilibili dance-category ranking page and save the results.

    The spider downloads one ranking page, stores the raw HTML on disk,
    extracts title, score, link and author for every ranked entry via XPath,
    and writes the extracted entries to a JSON file.
    """

    def __init__(self, timeout=10):
        """Set the ranking URL, the request headers and the network timeout.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument. Defaults to 10.
        """
        self.url = 'https://www.bilibili.com/ranking/all/129/0/3?spm_id_from=333.851.b_62696c695f7265706f72745f64616e6365.39'
        # A desktop-browser user-agent is sent with every request.
        self.headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
        }
        self.timeout = timeout

    @staticmethod
    def _first_or_none(node, query):
        """Return the first match of ``node.xpath(query)``, or None if empty."""
        matches = node.xpath(query)
        return matches[0] if matches else None

    @staticmethod
    def _timestamp():
        """Return ``time.ctime()`` with ':' replaced so it is file-name safe."""
        # ':' is not allowed in Windows file names; a raw ctime() string
        # would make open() fail there.
        return time.ctime().replace(":", "-")

    def get_response(self):
        """Download the ranking page and return its body as text.

        Returns:
            The response body decoded with ``bytes.decode()`` (UTF-8).
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(
            self.url, headers=self.headers, timeout=self.timeout
        )
        return response.content.decode()

    def make_content(self, ret1):
        """Extract the ranked entries from the grouped list elements.

        Args:
            ret1: Iterable of elements exposing an ``xpath(query)`` method,
                one per ranked entry.

        Returns:
            A list, in input order, of single-key dicts mapping the author
            name (formatted as a string, so a missing author becomes the key
            ``"None"``) to a dict with the keys ``title``, ``hot_score`` and
            ``title_href``. A field whose XPath query matches nothing is
            None.
        """
        all_item_list = []
        for entry in ret1:
            # Title, score and link of this ranked entry.
            item = {
                "title": self._first_or_none(
                    entry, './/div[@class="info"]/a/text()'),
                'hot_score': self._first_or_none(
                    entry, './/div[@class="pts"]/div/text()'),
                "title_href": self._first_or_none(
                    entry, './/div[@class="info"]/a/@href'),
            }
            author = self._first_or_none(
                entry, './/div[@class="detail"]/a/span/text()')
            # One dict per entry keeps the ranking order and keeps several
            # entries by the same author separate.
            all_item_list.append({"{}".format(author): item})
        return all_item_list

    def save_file_response(self, ret):
        """Write the raw page text to a timestamped HTML file.

        Args:
            ret: The page text to store.
        """
        file_name = "spider_bilibil({}).html".format(self._timestamp())
        with open(file_name, 'w', encoding="utf8") as f:
            f.write(ret)
        print("保存响应内容成功")

    def save_file(self, ret):
        """Write the extracted entries to a timestamped JSON file.

        Args:
            ret: JSON-serialisable data, normally the list returned by
                ``make_content``.
        """
        file_name = "哔哩哔哩舞蹈区前100名内容({}).json".format(self._timestamp())
        with open(file_name, 'w', encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(ret, ensure_ascii=False, indent=2))
        print("保存哔哩哔哩舞蹈区前100名内容成功")

    def run(self):
        """Run the whole pipeline: download, archive, parse, extract, save."""
        # Fetch the page text.
        ret = self.get_response()
        # Keep a copy of the raw response on disk.
        self.save_file_response(ret)
        # Parse the text into an element tree.
        html = etree.HTML(ret)
        # Select one <li> element per ranked entry.
        ret1 = html.xpath("//ul[@class='rank-list']/li[@class='rank-item']")
        # Extract the fields of every ranked entry.
        all_item = self.make_content(ret1)
        # Write the extracted entries to the JSON file.
        self.save_file(all_item)

        
if __name__ == '__main__':
    # Script entry point: build the spider and run the full pipeline once.
    spider = BiLiSpider()
    spider.run()

 

posted @ 2020-04-24 21:01  Norni  阅读(235)  评论(0)  编辑  收藏  举报