使用控制台抓包完成对博客网站的增量抓取

抓取目标:santos tang博客网站的评论
网址:http://www.santostang.com/

1.观察网页
image


点开页面,发现是一个专门记录网络爬虫的博客网站,点开具体文章,下拉发现评论部分
image




查看源代码,随便搜索一个评论,发现没有结果
image




我们接着考虑用F12开发者模式截取可能的数据包(chrome开发者模式更友好些)
image




下拉以后发现评论是由来必力平台加载的,fetch中传来了几个包,但是查看preview都没有信息, 于是选择ctrl+F直接搜索评论内容来定位数据
image




成功找到装有评论的json包
image




在headers里面找到了其申请的url地址,我们点开来细看
image




json内容很清晰
代码实现

import pymysql
import requests
import json
import random
import time
from Useragents import ua_list

# http://www.santostang.com/
class Spider(object):
    """Incremental comment scraper for the LiveRe widget on santostang.com.

    The comment feed was located by searching the captured network packets
    for a known comment string (see the write-up above).
    """

    def __init__(self):
        # JSONP endpoint of the LiveRe comment platform; the {} slot takes
        # the `limit` value (how many comments to return).
        self.url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112408682037701495582_1714279915663&limit={}&repSeq=4290319&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&_=1714279915665'
        # Local MySQL store for the scraped comments and the change marker.
        conn_kwargs = dict(
            host='localhost', user='root', password='123456',
            database='commentsdb', charset='utf8mb4'
        )
        self.db = pymysql.connect(**conn_kwargs)
        self.cursor = self.db.cursor()
        self.comments = []  # accumulated [name, content] rows awaiting insert

我们通过插入数据库、每次查询数据库的方式来判断是否有增量。提前在MySQL中建立数据库commentsdb,里面有comments和version两张表:comments表有name和comment两个字段,version表每次存放上一次最新的一条评论用于比对
功能函数

    def get_headers(self):
        """Build request headers with a User-Agent picked at random from ua_list."""
        return {'User-Agent': random.choice(ua_list)}

    def get_json(self, url):
        """Fetch the JSONP response at *url* and return the embedded JSON as a dict.

        The endpoint replies with a JSONP wrapper (``typeof jQuery...({...});``);
        everything outside the outermost braces is stripped before decoding.
        """
        raw = requests.get(url=url, headers=self.get_headers()).text
        # rfind('}') + 1 is safer than the fixed [:-2] cut: it tolerates any
        # trailing whitespace/newline after the closing ");" of the wrapper,
        # which would otherwise make json.loads fail.
        return json.loads(raw[raw.find('{'):raw.rfind('}') + 1])

    def parse_json(self, html_json):
        comment_list = html_json["results"]['parents']
        print(comment_list)
        for reply in comment_list:
            user = reply["name"]
            content = reply["content"]

            self.comments.append([user, content])
    def insert_mysql(self):
        """Replace the contents of the comments table with self.comments."""
        # Old rows are wiped first: the site is re-scraped in full on update.
        self.cursor.execute('delete from comments')
        self.cursor.executemany('insert into comments values(%s, %s)',
                                self.comments)

解析json时注意将前面的typeof部分和末尾的括号去掉,避免json.loads出错;在version表里比对最新一条评论,每次有增量的时候将原数据删除,重新爬取、插入并更新version表。ua_list为自己搜集的User-Agent列表
主函数

    def search(self):
        """Re-scrape and persist the comments only when the newest one changed.

        Probes the feed with limit=1, compares that comment against the marker
        stored in the ``version`` table, and on a mismatch refetches 50
        comments, rewrites the comments table and updates the marker.
        """
        probe = self.get_json(self.url.format(1))
        newest = probe["results"]['parents'][0]["content"]
        sel = 'select comment from version where comment=%s'
        if self.cursor.execute(sel, [newest]):
            print("网站未更新")
            return
        # Polite pause before the heavier full fetch.
        time.sleep(random.uniform(2, 3))
        self.parse_json(self.get_json(self.url.format(50)))
        self.insert_mysql()
        # Advance the change marker to the newest comment just observed.
        self.cursor.execute('delete from version')
        self.cursor.execute('insert into version values(%s)', [newest])
        self.db.commit()

经过观察发现url中的limit参数决定返回多少条评论,目前抓取50条
全代码

import pymysql
import requests
import json
import random
import time
from Useragents import ua_list

# http://www.santostang.com/
class Spider(object):
    """Incremental scraper for the LiveRe-hosted comments on santostang.com.

    The comment widget loads its data through a JSONP endpoint (found via the
    browser's network panel). The newest comment is compared against a marker
    stored in MySQL (table ``version``); the ``comments`` table is rewritten
    only when the site has new content.
    """

    def __init__(self):
        # JSONP endpoint; the {} placeholder receives `limit` (page size).
        self.url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112408682037701495582_1714279915663&limit={}&repSeq=4290319&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&_=1714279915665'
        # NOTE(review): credentials are hard-coded; move to config/env for real use.
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='commentsdb', charset='utf8mb4'
        )
        self.cursor = self.db.cursor()
        self.comments = []  # accumulated [name, content] rows awaiting insert

    def get_headers(self):
        """Return request headers with a randomly chosen User-Agent."""
        return {'User-Agent': random.choice(ua_list)}

    @staticmethod
    def _parse_jsonp(text):
        """Strip the JSONP wrapper (``typeof jQuery...({...});``) and decode the JSON.

        Raises ValueError when no JSON object is present in *text*.
        """
        start = text.find('{')
        end = text.rfind('}')
        if start == -1 or end == -1:
            raise ValueError('no JSON object found in JSONP response')
        # Slicing to the last '}' is robust against trailing whitespace after
        # ");", unlike the fixed [:-2] cut.
        return json.loads(text[start:end + 1])

    def get_json(self, url):
        """Fetch *url* and return the decoded JSON payload as a dict."""
        response = requests.get(url=url, headers=self.get_headers())
        return self._parse_jsonp(response.text)

    def parse_json(self, html_json):
        """Append [name, content] for every top-level comment to self.comments."""
        comment_list = html_json["results"]['parents']
        print(comment_list)
        for reply in comment_list:
            self.comments.append([reply["name"], reply["content"]])

    def search(self):
        """Re-scrape and persist the comments only if the newest one changed."""
        # Probe with limit=1 to fetch just the newest comment.
        html_json = self.get_json(self.url.format(1))
        first_comment = html_json["results"]['parents'][0]["content"]
        sel = 'select comment from version where comment=%s'
        if self.cursor.execute(sel, [first_comment]):
            print("网站未更新")  # marker matches -> site unchanged, nothing to do
            return
        time.sleep(random.uniform(2, 3))  # polite pause before the full fetch
        html_json = self.get_json(self.url.format(50))
        self.parse_json(html_json)
        self.insert_mysql()
        # Advance the change marker to the newest comment just seen.
        self.cursor.execute('delete from version')
        self.cursor.execute('insert into version values(%s)', [first_comment])
        self.db.commit()

    def insert_mysql(self):
        """Replace the contents of the comments table with self.comments."""
        self.cursor.execute('delete from comments')
        self.cursor.executemany('insert into comments values(%s, %s)', self.comments)

    def close(self):
        """Release the database cursor and connection (previously never closed)."""
        self.cursor.close()
        self.db.close()

    def run(self):
        """Entry point: run one incremental check, then release DB resources."""
        try:
            self.search()
        finally:
            self.close()


if __name__ == "__main__":
    spider = Spider()
    spider.run()

注意:网站会监控访问量,若短时间内数次访问可能会只显示第一条评论,若出现此情况需要等几分钟,用print查看具体情况

posted @ 2024-04-28 14:40  mike2367  阅读(17)  评论(0)    收藏  举报