# 二级静态页面的爬取-----电影天堂 (Two-level static page crawling — Dianying Tiantang, dytt8.net)
'''Two-level static page crawler for the Dianying Tiantang movie site.'''
from urllib import request
import re
import time
import random

import pymysql


class DianyingtiantangSpider:
    """Scrape movie names from dytt8.net list pages, follow each movie's
    detail page for its download link, and store the (name, link) pairs
    in the MySQL table ``film``.
    """

    def __init__(self):
        # List-page URL template; {} is filled with the 1-based page number.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # Pick a random desktop User-Agent per run to reduce the chance of blocking.
        self.headers = {'User-Agent': random.choice([
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201'
        ])}
        self.db = pymysql.connect(host='localhost', port=3306, user='root',
                                  passwd='123456', db='dianyingdb', charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        """Fetch *url* and return its decoded HTML (used for both page levels).

        The site declares gb2312, but we decode as 'gbk' — a strict superset
        of gb2312 — so fewer characters are silently dropped by 'ignore'.
        """
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        return res.read().decode('gbk', 'ignore')

    def parse_page(self, html):
        """Parse one list page: extract (detail link, movie name) pairs,
        fetch each detail page's download link, then persist the batch."""
        pattern = re.compile(
            '<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>',
            re.S)
        # film_list: [('detail link', 'movie name'), ...]
        film_list = pattern.findall(html)
        result_list = []
        for link, name in film_list:
            film_name = name.strip()
            # Detail links are site-relative; prepend the scheme + host.
            film_link = 'https://www.dytt8.net{}'.format(link.strip())
            download_link = self.parse_two_page(film_link)
            result_list.append([film_name, download_link])
        self.save_page(result_list)

    def parse_two_page(self, film_link):
        """Fetch a detail page and return its first download link.

        Returns '' when the page has no matching cell (previously this
        raised IndexError on download_link[0] and aborted the crawl).
        """
        two_html = self.get_page(film_link)
        pattern = re.compile('<td style="WORD-WRAP.*?>.*?>(.*?)</a>', re.S)
        download_link = pattern.findall(two_html)
        return download_link[0].strip() if download_link else ''

    def save_page(self, result_list):
        """Bulk-insert (name, link) rows into `film` and commit."""
        if result_list:  # skip the DB round-trip when a page yielded nothing
            ins = 'insert into film values(%s,%s)'
            self.cursor.executemany(ins, result_list)
            self.db.commit()

    def main(self):
        """Entry point: clear the `film` table, crawl list pages 1-4,
        then release the cursor and connection."""
        self.cursor.execute('delete from film')
        self.db.commit()
        try:
            # NOTE: removed the redundant `i = 1` / `i += 1` around this
            # for-loop — `range` already drives the counter.
            for i in range(1, 5):
                html = self.get_page(self.url.format(i))
                self.parse_page(html)
                print('第{}页爬取成功'.format(i))
                time.sleep(random.randint(1, 3))  # polite random delay
        finally:
            # Close DB resources even if a page fails mid-crawl.
            self.cursor.close()
            self.db.close()


if __name__ == '__main__':
    start = time.time()
    spider = DianyingtiantangSpider()
    spider.main()
    end = time.time()
    print('程序执行时间为:%.2f' % (end - start))