# -*- coding: utf-8 -*-
import scrapy


class HrSpider(scrapy.Spider):
    """Spider that scrapes job postings from the Tencent HR position list.

    Starts at the first listing page and follows the "next" link until the
    pager no longer provides a real URL.
    """

    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        """Yield one item per listing row, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            dict: An item with ``title``, ``position`` and ``pub_date`` keys
                (each value is ``None`` when the cell has no matching text).
            scrapy.Request: A request for the next listing page, handled by
                this same method, when a usable next-page link exists.
        """
        # The first and last rows of the table are skipped; presumably they
        # are the header row and the pager row rather than job postings.
        rows = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for row in rows:
            yield {
                'title': row.xpath("./td[1]/a/text()").extract_first(),
                'position': row.xpath("./td[2]/text()").extract_first(),
                'pub_date': row.xpath("./td[5]/text()").extract_first(),
            }

        # Request the next page (requires USER_AGENT to be set in settings).
        next_url = response.xpath("//a[@id='next']/@href").extract_first()

        # extract_first() returns None when the page has no "next" anchor;
        # a "javascript:;" href is a placeholder, not a real page to follow.
        if not next_url or next_url == "javascript:;":
            return

        yield scrapy.Request(
            # Resolve the href against the current page URL instead of
            # concatenating it onto a hard-coded host.
            response.urljoin(next_url),
            callback=self.parse,  # parse the next page with this same method
        )
scrapy.Request(url, [callback, method='GET', headers, body, cookies, meta, dont_filter=False])
callback:指定传入的url交给哪个函数去解析
meta:实现在不同的解析函数中传递数据,meta默认携带部分数据,如下载延迟、请求深度等
dont_filter:让scrapy的去重不会过滤当前url;scrapy有默认的url去重功能,对需要重复请求的url需要设置为True