spider
# -*- coding: utf-8 -*-
import scrapy

from Jobs.items import JobsItem


class Job51spiderSpider(scrapy.Spider):
    """Crawl 51job search-result pages and yield one ``JobsItem`` per listing row.

    Result pages are requested one after another: ``offset`` holds the number
    of the most recently scheduled page, and each parsed page schedules the
    next one until the page count read from the pager has been reached.
    """

    name = 'Job51Spider'
    allowed_domains = ['www.51job.com', 'search.51job.com']
    # Number of the most recently scheduled result page.
    offset = 1
    # Search URL prefix; the page number and ".html" are appended to it.
    url = "https://search.51job.com/list/090200,000000,0000,00,9,99,php,2,"
    start_urls = [url + str(offset) + ".html"]

    def parse(self, response):
        """Yield the job items on this page, then a request for the next page.

        Args:
            response: The downloaded search-result page.

        Yields:
            ``JobsItem`` objects, one per result row, followed by at most one
            ``scrapy.Request`` for the next result page.
        """
        self.logger.debug("Parsing %s", response.url)

        # The header row carries the "title" class and is excluded by the selector.
        for row in response.css('#resultList .el:not(.title)'):
            item = JobsItem()
            # Job title
            item['zwname'] = row.css('.t1 a').xpath('./@title').extract_first()
            # Company name
            item['gsname'] = row.css('.t2 a').xpath('./@title').extract_first()
            # Work location
            item['gzdd'] = row.css('.t3::text').extract_first()
            # Salary
            item['gz'] = row.css('.t4::text').extract_first()
            # Publication time
            item['fbtime'] = row.css('.t5::text').extract_first()
            yield item

        total_pages = self._total_pages(response)
        if total_pages is None:
            # Without a page count there is no way to know whether another
            # page exists, so stop paginating instead of raising.
            return

        if self.offset < total_pages:
            self.offset += 1
            next_url = self.url + str(self.offset) + ".html"
            yield scrapy.Request(url=next_url, callback=self.parse)

    def _total_pages(self, response):
        """Return the page count read from the pager, or ``None`` if unreadable.

        The count is taken from the text before the first '页' in the pager
        span, with any surrounding '共' characters stripped.
        """
        pager_text = response.xpath(
            '//div[@class="dw_page"]/div/div/div/span/text()'
        ).extract_first()
        if pager_text is None:
            # extract_first() returns None when the XPath matches nothing.
            self.logger.warning(
                "No pager text found on %s; stopping pagination", response.url
            )
            return None

        try:
            return int(pager_text.split('页')[0].strip('共'))
        except ValueError:
            self.logger.warning(
                "Could not read a page count from %r on %s; stopping pagination",
                pager_text,
                response.url,
            )
            return None
items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JobsItem(scrapy.Item):
    """Fields collected for a single scraped job listing."""

    zwname = scrapy.Field()  # job title
    gsname = scrapy.Field()  # company name
    gzdd = scrapy.Field()  # work location
    gz = scrapy.Field()  # salary
    fbtime = scrapy.Field()  # publication time