使用控制台抓包完成对博客网站的增量抓取
抓取目标:santos tang博客网站的评论
网址:http://www.santostang.com/
1.观察网页

点开页面,发现是一个专门记录网络爬虫的博客网站,点开具体文章,下拉发现评论部分

查看源代码,随便搜索一个评论,发现没有结果

我们接着考虑用F12开发者模式截取可能的数据包(chrome开发者模式更友好些)

下拉以后发现评论是由来必力平台加载的,fetch中传来了几个包,但是查看preview都没有信息, 于是选择ctrl+F直接搜索评论内容来定位数据

成功找到装有评论的json包

在headers里面找到了其申请的url地址,我们点开来细看

json内容很清晰
代码实现
import pymysql
import requests
import json
import random
import time
from Useragents import ua_list
# http://www.santostang.com/
class Spider(object):
    # The comment JSON packet was located by searching a known comment
    # string in the browser devtools network panel.
    def __init__(self):
        """Prepare the LiveRe comment-feed URL and the MySQL connection."""
        # {} is filled with the `limit` query parameter (comments per request).
        self.url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112408682037701495582_1714279915663&limit={}&repSeq=4290319&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&_=1714279915665'
        self.db = pymysql.connect(host='localhost',
                                  user='root',
                                  password='123456',
                                  database='commentsdb',
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()
        # Scraped [name, content] pairs, filled by parse_json.
        self.comments = []
我们通过插入数据库、每次查询数据库的方式来判断是否有增量。提前在mysql内部建立数据库commentsdb,里面有comments和version两张表:comments表有name和comment两个字段,用于存放抓取到的评论;version表每次存放上一次抓到的最新一条评论,用于比对是否有更新
功能函数
def get_headers(self):
    """Build request headers with a randomly chosen User-Agent string."""
    return {'User-Agent': random.choice(ua_list)}
def get_json(self, url):
    """Fetch one JSONP comment packet and return it as a parsed dict.

    The LiveRe endpoint wraps its JSON payload in a JSONP callback such as
    ``jQuery...({...});`` so the wrapper must be stripped before parsing.
    """
    # A missing timeout can hang the crawler forever on a stalled server.
    json_string = requests.get(url=url, headers=self.get_headers(), timeout=10).text
    # Keep everything between the first '{' and the last '}' -- more robust
    # than the original fixed [:-2] slice, which breaks if the wrapper
    # (e.g. a trailing newline) ever changes.
    start = json_string.find('{')
    end = json_string.rfind('}') + 1
    return json.loads(json_string[start:end])
def parse_json(self, html_json):
    """Collect [name, content] pairs from one parsed comment packet."""
    parents = html_json["results"]['parents']
    print(parents)  # debug aid: inspect the raw comment list
    self.comments.extend([entry["name"], entry["content"]] for entry in parents)
def insert_mysql(self):
    """Rebuild the `comments` table from self.comments (full refresh)."""
    # Wipe the table first, then bulk-insert the freshly scraped rows.
    self.cursor.execute('delete from comments')
    self.cursor.executemany('insert into comments values(%s, %s)', self.comments)
解析json时注意将前面的jQuery回调函数名(typeof判断部分)和后面的括号去掉,避免json.loads解析出错;在version表里比对最新一条评论,每次有增量的时候将原数据删除,重新爬取和插入并更新version表。ua_list为自己搜集的UserAgent列表
主函数
def search(self):
    """Check for new comments; rescrape and persist only when there are any.

    Fetches the newest comment (limit=1) and compares it with the snapshot
    stored in the `version` table.  On a mismatch the full comment list is
    rescraped, the `comments` table is rebuilt and the snapshot is updated.
    """
    html_json = self.get_json(self.url.format(1))
    parents = html_json["results"]['parents']
    if not parents:
        # A throttled/rate-limited response can contain no comments at all;
        # the original code would raise IndexError here.
        print("未获取到评论数据")
        return
    first_comment = parents[0]["content"]
    # A non-zero row count means the newest comment equals our snapshot.
    sel = 'select comment from version where comment=%s'
    if self.cursor.execute(sel, [first_comment]):
        print("网站未更新")
    else:
        # Polite delay before the second, heavier request.
        time.sleep(random.uniform(2, 3))
        html_json = self.get_json(self.url.format(50))
        self.parse_json(html_json)
        self.insert_mysql()
        # Refresh the snapshot so the next run compares against this batch.
        self.cursor.execute('delete from version')
        self.cursor.execute('insert into version values(%s)', [first_comment])
        self.db.commit()
经过观察发现url中的limit查询参数决定返回多少条评论,目前抓取50条
全代码
import pymysql
import requests
import json
import random
import time
from Useragents import ua_list
# http://www.santostang.com/
class Spider(object):
    """Incremental scraper for the LiveRe comment feed of santostang.com.

    Comments are loaded client-side by the LiveRe widget, so they are fetched
    from its JSONP endpoint (located by searching a known comment string in
    the browser devtools).  MySQL database `commentsdb` keeps two tables:
    `comments` (name, comment) holds the scraped rows and `version` stores
    the newest comment seen, which is compared on each run to detect updates.
    """

    def __init__(self):
        # {} is filled with the `limit` query parameter (comments per request).
        self.url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112408682037701495582_1714279915663&limit={}&repSeq=4290319&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&_=1714279915665'
        self.db = pymysql.connect(host='localhost',
                                  user='root',
                                  password='123456',
                                  database='commentsdb',
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()
        # Scraped [name, content] pairs, filled by parse_json.
        self.comments = []

    def get_headers(self):
        """Build request headers with a randomly chosen User-Agent string."""
        return {'User-Agent': random.choice(ua_list)}

    def get_json(self, url):
        """Fetch one JSONP comment packet and return it as a parsed dict."""
        # A missing timeout can hang the crawler forever on a stalled server.
        json_string = requests.get(url=url, headers=self.get_headers(), timeout=10).text
        # Strip the JSONP wrapper (``jQuery...({...});``): keep everything
        # between the first '{' and the last '}' -- more robust than the
        # original fixed [:-2] slice.
        start = json_string.find('{')
        end = json_string.rfind('}') + 1
        return json.loads(json_string[start:end])

    def parse_json(self, html_json):
        """Collect [name, content] pairs from one parsed comment packet."""
        comment_list = html_json["results"]['parents']
        # Debug aid: the site may return only one comment when throttled.
        print(comment_list)
        for reply in comment_list:
            self.comments.append([reply["name"], reply["content"]])

    def insert_mysql(self):
        """Rebuild the `comments` table from self.comments (full refresh)."""
        self.cursor.execute('delete from comments')
        self.cursor.executemany('insert into comments values(%s, %s)', self.comments)

    def search(self):
        """Check for new comments; rescrape and persist only when there are any.

        Fetches the newest comment (limit=1) and compares it with the snapshot
        in the `version` table; on a mismatch the full list is rescraped and
        both tables are rewritten in one transaction.
        """
        html_json = self.get_json(self.url.format(1))
        parents = html_json["results"]['parents']
        if not parents:
            # A rate-limited response can contain no comments at all; the
            # original code would raise IndexError here.
            print("未获取到评论数据")
            return
        first_comment = parents[0]["content"]
        # A non-zero row count means the newest comment equals our snapshot.
        sel = 'select comment from version where comment=%s'
        if self.cursor.execute(sel, [first_comment]):
            print("网站未更新")
        else:
            # Polite delay before the second, heavier request.
            time.sleep(random.uniform(2, 3))
            self.parse_json(self.get_json(self.url.format(50)))
            self.insert_mysql()
            # Refresh the snapshot so the next run compares against this batch.
            self.cursor.execute('delete from version')
            self.cursor.execute('insert into version values(%s)', [first_comment])
            self.db.commit()

    def run(self):
        """Entry point: perform one incremental check."""
        self.search()
if __name__ == "__main__":
    # Run one incremental scrape when executed as a script.
    Spider().run()
注意:网站会监控访问量,若短时间内数次访问可能会只显示第一条评论,若出现此情况需要等几分钟,用print查看具体情况
浙公网安备 33010602011771号