Scraping Python job listings from 51job with the Scrapy framework and exporting them to Excel
Passing data between requests (request meta): a 51job example (the project was originally meant for Boss Zhipin and later switched to 51job, but the project name chosen when the project was created was never changed)
-
In some cases the data we want to scrape does not live on a single page. For example, when scraping a movie site, the movie title and rating sit on the first-level listing page, while the rest of the details sit on a second-level detail page. In that situation we need to pass data along with the request.
-
When to pass data with a request
- Whenever the data being scraped is not all on the same page, the partially filled item must be carried from one callback to the next along with the request, as sketched right below.
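A minimal sketch of the pattern, independent of this project (the domain, XPaths and field names are placeholders, not the real 51job ones): the listing callback fills part of the item and hands it to the detail callback through Request.meta.

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com/list']  # placeholder URL

    def parse(self, response):
        for row in response.xpath('//div[@class="row"]'):  # placeholder XPath
            item = {'title': row.xpath('./a/text()').extract_first()}  # field from the listing page
            detail_url = row.xpath('./a/@href').extract_first()
            # carry the half-filled item to the detail callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']  # take the item back out of meta
        item['desc'] = response.xpath('//div[@id="desc"]//text()').extract_first()  # field from the detail page
        yield item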
-
Basic Scrapy usage:
- Create the project:
- scrapy startproject Boss
- Enter the project directory:
- cd Boss
- Create the spider file in the spiders subdirectory:
- scrapy genspider boss www.xxx.com
- Write the code
- Run the project (by spider name):
- scrapy crawl boss (runs the spider)
- scrapy crawl boss --nolog (suppresses the log), or add LOG_LEVEL = 'ERROR' to the settings file instead, as shown below
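The log setting mentioned above goes into the project's settings.py; a one-line sketch:

# settings.py
LOG_LEVEL = 'ERROR'  # only ERROR-level messages are printed, so the crawl output stays readable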
Below is the code for pipeline-based persistent storage:
-
spiders/boss.py
-
# -*- coding: utf-8 -*-
import scrapy
from ..items import BossItem


class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.zhipin.com']  # left over from the original Boss Zhipin version
    # 51job search results, filtered to python positions
    start_urls = ['https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?&workyear=02']
    # URL template used for pagination
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,%s.html?&workyear=02'
    page_num = 2

    # parse the detail page
    def parse_detail(self, response):
        item = response.meta['item']
        job_desc = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div//text()').extract()
        job_desc = ''.join(job_desc)  # join the list of text nodes into a single string
        item['job_desc'] = job_desc.strip()
        yield item  # submit the item to the pipeline (pipelines.py)

    # parse the job fields on the listing page
    def parse(self, response):
        div_list = response.xpath('//*[@id="resultList"]/div[@class="el"]')
        for div in div_list:
            item = BossItem()
            job_name = div.xpath('./p/span/a/text()').extract_first()  # extract_first() returns the first element of the result list
            # print(job_name)
            job_company = div.xpath('./span[1]/a/text()').extract_first()
            job_city = div.xpath('./span[2]/text()').extract_first()
            job_saray = div.xpath('./span[3]/text()').extract_first()
            job_pubtime = div.xpath('./span[4]/text()').extract_first()
            # pack the fields into the item object
            item['job_name'] = job_name.strip()
            item['job_company'] = job_company
            item['job_city'] = job_city
            item['job_saray'] = job_saray
            item['job_pubtime'] = job_pubtime
            # get the href of the detail page
            detail_url = div.xpath('./p/span/a/@href').extract_first()
            # print(detail_url)
            # manually send a request to the detail page, passing the item along via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
        # pagination: crawl the next results page
        if self.page_num <= 5:
            new_url = format(self.url % (self.page_num))
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)  # call parse again to handle the next page
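To make the extract_first()/extract() calls above easier to follow, here is a small standalone illustration using scrapy.Selector on a made-up HTML fragment (the HTML is invented for the example, not taken from 51job):

from scrapy.selector import Selector

html = '<div class="el"><p><span><a href="/job/1">Python Developer </a></span></p></div>'
sel = Selector(text=html)

# extract() returns a list of all matches; extract_first() returns only the first one (or None)
print(sel.xpath('//div[@class="el"]/p/span/a/text()').extract())                 # ['Python Developer ']
print(sel.xpath('//div[@class="el"]/p/span/a/text()').extract_first())           # 'Python Developer '
print(sel.xpath('//div[@class="el"]/p/span/a/text()').extract_first().strip())   # 'Python Developer'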
-
items.py
-
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BossItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()     # job title
    job_company = scrapy.Field()  # company
    job_city = scrapy.Field()     # city
    job_saray = scrapy.Field()    # salary
    job_pubtime = scrapy.Field()  # publish date
    job_desc = scrapy.Field()     # job description
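BossItem behaves like a dictionary, which is why the spider assigns fields with item['...']; a quick illustration (the values are made up, and the import assumes the project package is named Boss as created above):

from Boss.items import BossItem

item = BossItem()
item['job_name'] = 'Python developer'  # made-up value
item['job_saray'] = '1-1.5万/月'       # made-up value; note the field really is spelled job_saray in items.py
print(dict(item))  # {'job_name': 'Python developer', 'job_saray': '1-1.5万/月'}
# item['foo'] = 'bar'  # would raise KeyError, because foo is not declared in BossItem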
-
pipelines.py (the item pipeline)
-
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from openpyxl import Workbook
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font, NamedStyle


class BossPipeline(object):
    # constructor
    def __init__(self):
        pass

    def open_spider(self, spider):
        self.wb = Workbook()      # create the workbook
        self.ws = self.wb.active  # get the active worksheet
        # header row (job title, company, city, salary, publish date, job description)
        self.ws.append(['职位名', '公司', '城市', '薪资', '发布时间', '职位描述'])
        # style the header cells
        self.li = ['A', 'B', 'C', 'D', 'E']  # columns that get a fixed width and a colored header
        highlight = NamedStyle(name="highlight")
        highlight.fill = PatternFill("solid", fgColor="66CD00")  # solid background fill
        for i in self.li:
            self.ws.column_dimensions[i].width = 30  # set the column width
            self.ws["%s1" % i].style = highlight     # color the header cell in row 1

    def process_item(self, item, spider):
        # print('job title:', item["job_name"])
        # print('company:', item["job_company"])
        # print('city:', item["job_city"])
        # print('salary:', item["job_saray"])
        # print('publish date:', item["job_pubtime"])
        # print('description:', item["job_desc"])
        # row content
        desc = [item["job_name"], item["job_company"], item["job_city"],
                item["job_saray"], item["job_pubtime"], item["job_desc"]]
        print(item["job_name"])
        # append the row to the Excel worksheet
        self.ws.append(desc)
        return item  # return the item so any later pipeline classes can also persist it

    def close_spider(self, spider):
        self.wb.save('./51_python_job.xlsx')
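Once the crawl has finished, the workbook can be spot-checked with openpyxl itself; a minimal read-back sketch (the file name matches the one saved in close_spider above):

from openpyxl import load_workbook

wb = load_workbook('./51_python_job.xlsx')
ws = wb.active
print(ws.max_row, 'rows written')  # header row plus one row per scraped job
for row in ws.iter_rows(min_row=1, max_row=3, values_only=True):
    print(row)  # each row comes back as a tuple of cell values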
-
settings.py (enable the pipeline)
-
# enable the item pipeline
ITEM_PIPELINES = {
    'Boss.pipelines.BossPipeline': 300,
}
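Besides ITEM_PIPELINES, a crawl like this usually needs a couple more settings; the values below are common assumptions for this kind of project, not something specified in this article:

# settings.py (sketch; only ITEM_PIPELINES and LOG_LEVEL appear in this article, the rest are assumptions)
ITEM_PIPELINES = {
    'Boss.pipelines.BossPipeline': 300,  # the number is a priority: lower values run earlier when several pipelines are enabled
}
ROBOTSTXT_OBEY = False      # assumption: ignore robots.txt so the search result pages are actually fetched
USER_AGENT = 'Mozilla/5.0'  # assumption: send a browser-like User-Agent instead of the Scrapy default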
The scraped data looks like this: