scrapy爬虫中如何实现翻页请求
通过scrapy.Request实现翻页请求:
scrapy.Request(url, callback=None, method='GET', headers=None, body=None, cookies=None, meta=None, encoding='utf-8', priority=0, dont_filter=False, errback=None, flags=None)
这里以爬取腾讯招聘网站的岗位信息为例制作一个爬虫进行翻页请求的实现
# -*- coding: utf-8 -*-
import scrapy


class HrSpider(scrapy.Spider):
    """Crawl Tencent job postings (hr.tencent.com) and follow pagination.

    Each listing row yields one item dict; after the page is consumed, a
    new Request for the next page is yielded until the "next" link is
    disabled (href == "javascript:;") or missing.
    """

    name = 'Hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        """Extract job items from the listing table, then follow next page.

        :param response: scrapy Response for a position.php listing page.
        :yields: dicts with keys "title", "postion", "publish_date",
                 then a scrapy.Request for the next page (if any).
        """
        # [1:-1] drops the header row and the trailing pagination row.
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            # NOTE(review): key spelled "postion" (sic) — kept unchanged in
            # case downstream pipelines already rely on this exact key.
            item["postion"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item

        # Find the next-page URL and follow it.
        # Fixes two bugs in the original:
        #   1. The comparison was against " javascript:;" (leading space),
        #      which never matches the site's actual disabled-link href
        #      "javascript:;", so the last page produced an invalid request.
        #   2. No None guard: if the xpath matches nothing, string
        #      concatenation would raise TypeError.
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url and next_url != "javascript:;":
            next_url = "https://hr.tencent.com/" + next_url
            yield scrapy.Request(
                next_url,
                callback=self.parse,
            )