scrapy -- 请求传参

为什么要请求传参，使用的场景是什么呢？

什么是请求传参：当爬虫文件中定义了多个回调函数，而这些回调函数解析出的数据需要封装到同一个 item 中、交给同一个管道存储时，就要在发起请求时把 item 传递给下一个回调函数，这就是请求传参
使用场景：当要解析的数据不在同一个页面中（例如列表页和详情页），但又想把不同页面爬取到的数据传到同一个管道里存储时，就需要用到请求传参

案例：基于BOSS直聘网，爬取不同网页的数据

代码示例：

#1.爬虫文件.py代码示例：

import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    """Spider demonstrating how to pass data between callbacks via ``meta``.

    ``parse`` reads the job name of every entry on a listing page, requests
    the entry's detail page and hands the partially filled item to
    ``parse_detail`` through ``Request.meta``. ``parse_detail`` adds the job
    description and yields the completed item.
    """

    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position=']

    # Template for the follow-up listing pages; ``%d`` is the page number.
    url = 'https://www.zhipin.com/c101010100/?query=python&page=%d'
    # Number of the next listing page to request.
    page_num = 2

    def parse_detail(self, response):
        """Add the job description to the item received via ``meta``.

        Args:
            response: Detail-page response whose ``meta['item']`` holds the
                item created in ``parse``.

        Yields:
            The item with ``job_desc`` filled in.
        """
        # Retrieve the item that ``parse`` attached to the request.
        item = response.meta['item']

        desc_parts = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        item['job_desc'] = ''.join(desc_parts)

        yield item

    def parse(self, response):
        """Parse a listing page: one detail request per job, then paginate.

        Args:
            response: Listing-page response.

        Yields:
            A request for each job's detail page (carrying the item in
            ``meta``), followed by a request for the next listing page while
            ``page_num`` is at most 3.
        """
        for li in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
            href = li.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
            if href is None:
                # Without a link the detail page cannot be requested; the
                # previous string concatenation raised TypeError here and
                # aborted the whole callback, including pagination.
                continue

            item = BossproItem()
            item['job_name'] = li.xpath('.//div[@class="info-primary"]/h3/a/div[1]/text()').extract_first()

            # Manually issue the detail-page request. The ``meta`` dict is
            # handed to the callback, which is how the item travels from
            # this callback to ``parse_detail``.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Pagination: follow listing pages up to and including page 3.
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1

            yield scrapy.Request(new_url, callback=self.parse)

#2.items.py文件代码示例：

import scrapy


class BossproItem(scrapy.Item):
    """Item holding the data collected for one job posting."""

    # Job title, set by the spider's ``parse`` callback.
    job_name = scrapy.Field()
    # Job description text, set by the spider's ``parse_detail`` callback.
    job_desc = scrapy.Field()

#3.pipeline.py文件代码示例：

class BossproPipeline:
    """Pipeline that prints every item it receives."""

    def process_item(self, item, spider):
        """Print ``item`` and return it unchanged.

        Args:
            item: The item yielded by the spider.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object that was passed in.
        """
        print(item)
        return item

#4.settings.py文件中的配置：以下几个参数如无特别说明，后续的项目都按此配置即可

# Log level: only messages at ERROR level are printed.
LOG_LEVEL = 'ERROR'

# User-Agent spoofing; copy the value from your own browser.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'


# Do not obey robots.txt.
# NOTE(review): the original note says this setting defaults to False —
# confirm against the settings.py generated for the project.
ROBOTSTXT_OBEY = False

# Register the project's item pipeline (with the value 300).
ITEM_PIPELINES = {
   'bossPro.pipelines.BossproPipeline': 300,
}