Spider -- MySQL数据库 之 增量爬取
# 思路
# 1、MySQL中新建表 request_finger,存储所有爬取过的链接的指纹(URL 的 md5 值)
# 2、在爬取之前,先判断该链接的指纹是否已存在于表中;如果已经爬取过,则不再继续爬取
1、建库建表
# Create the database and the tables used by the spider.
# NOTE: the comment must stay on its own line; a `#` comment runs to the end
# of the line and would otherwise swallow the statement that follows it.
create database filmskydb charset utf8;
use filmskydb;

# One row per crawled detail-page URL; finger is the 32-character md5 hex digest of the URL.
create table request_finger(
    finger char(32)
)charset=utf8;

# One row per scraped film: its title and its download link.
create table filmtab(
    name varchar(200),
    download varchar(500)
)charset=utf8;
2、完整代码
from hashlib import md5
import random
import re
import time
from urllib import request

import pymysql

from useragents import ua_list  # project-local module providing a list of User-Agent strings


class FilmSkySpider(object):
    """Incremental crawler for dytt8.net film pages backed by MySQL.

    The md5 fingerprint of every crawled detail-page URL is stored in the
    ``request_finger`` table; a URL whose fingerprint is already stored is
    skipped on later runs. Scraped films are written to ``filmtab``.
    """

    def __init__(self):
        # URL template of the first-level (listing) pages.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional connection parameters.
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='filmskydb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def close(self):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.db.close()

    def get_html(self, url, timeout=30):
        """Fetch ``url`` with a random User-Agent and return the decoded HTML.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait on the network before ``urlopen`` gives
                up, so a stalled connection cannot hang the crawl forever.

        Returns:
            The response body decoded as gb2312.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(req, timeout=timeout) as res:
            # The site's page source declares charset gb2312; characters that
            # cannot be decoded are dropped instead of raising.
            return res.read().decode('gb2312', 'ignore')

    def re_func(self, re_bds, html):
        """Return every match of the regex ``re_bds`` in ``html``.

        The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
        """
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    def parse_page(self, one_url):
        """Crawl one listing page and every not-yet-crawled film linked from it.

        Args:
            one_url: URL of a first-level (listing) page.
        """
        html = self.get_html(one_url)
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">.*?</table>'
        # one_page_list looks like: ['/html/xxx', '/html/xxx', ...]
        one_page_list = self.re_func(re_bds, html)
        for href in one_page_list:
            two_url = 'https://www.dytt8.net' + href
            # Fingerprint of the detail-page URL: its md5 hex digest.
            two_url_md5 = md5(two_url.encode()).hexdigest()
            if not self.is_go_on(two_url_md5):
                # Already crawled on an earlier run: no request, no sleep.
                continue
            if self.parse_two_page(two_url):
                # Record the fingerprint only after the film was stored, so a
                # page that could not be parsed is retried on the next run.
                ins = 'insert into request_finger values(%s)'
                self.cursor.execute(ins, [two_url_md5])
                self.db.commit()
            # Random pause (float seconds) after each detail-page request.
            time.sleep(random.uniform(1, 3))

    def is_go_on(self, two_url_md5):
        """Return True if the fingerprint is not in the database yet.

        Args:
            two_url_md5: md5 hex digest of a detail-page URL.

        Returns:
            True when the URL still has to be crawled, False when its
            fingerprint is already stored in ``request_finger``.
        """
        sel = 'select finger from request_finger where finger=%s'
        # execute() returns the number of matching rows; zero means "new".
        return not self.cursor.execute(sel, [two_url_md5])

    def parse_two_page(self, two_url):
        """Scrape one film detail page and store its name and download link.

        Args:
            two_url: URL of a second-level (detail) page.

        Returns:
            True if a film row was inserted, False if the page did not match
            the expected layout and was skipped.
        """
        html = self.get_html(two_url)
        re_bds = r'<div class="title_all"><h1><font color=#07519a>(.*?)</font></h1></div>.*?<td style="WORD-WRAP.*?>.*?>(.*?)</a>'
        # two_page_list looks like: [('name 1', 'ftp://xxxx.mkv')]
        two_page_list = self.re_func(re_bds, html)
        if not two_page_list:
            # Layout differs or there is no download link: skip this page
            # instead of aborting the whole crawl with an IndexError.
            print('no film data found, skipped:', two_url)
            return False

        name, download = two_page_list[0]
        film_list = [name.strip(), download.strip()]
        ins = 'insert into filmtab values(%s,%s)'
        self.cursor.execute(ins, film_list)
        self.db.commit()
        print(film_list)
        return True

    def main(self):
        """Crawl listing pages 1 through 200, pausing between pages."""
        for page in range(1, 201):
            one_url = self.url.format(page)
            self.parse_page(one_url)
            # Random pause (float seconds) between listing pages.
            time.sleep(random.uniform(1, 3))


if __name__ == '__main__':
    spider = FilmSkySpider()
    try:
        spider.main()
    finally:
        # Release the database resources even if the crawl is interrupted.
        spider.close()