Python: Scraping 58.com (58同城) Job Listings with the Scrapy Framework
1. Project directory structure:
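A sketch of the usual scrapy startproject layout for this example; the project name job58city is an assumption, since only the item class Job58CityItem and the spider name jobs appear in the code:

job58city/                  # project root (name assumed)
    scrapy.cfg
    job58city/
        __init__.py
        items.py            # Job58CityItem
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            jobs.py         # JobsSpider
    main.py                 # run script (name assumed), shown at the end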
The code for the three files (items.py, the jobs spider, and a small run script) is as follows:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Job58CityItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    money = scrapy.Field()
    job_wel = scrapy.Field()
    company = scrapy.Field()
    position_type = scrapy.Field()
    xueli = scrapy.Field()
    jingyan = scrapy.Field()
    address = scrapy.Field()
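Items defined this way are usually persisted through an item pipeline. The original code does not include one; below is a minimal sketch, assuming a pipelines.py in the project and a hypothetical Job58CityPipeline class (it would still need to be enabled via ITEM_PIPELINES in settings.py, see the fragment further down):

# pipelines.py (sketch, not part of the original post)
import json


class Job58CityPipeline(object):
    def open_spider(self, spider):
        # one JSON object per line; utf-8 keeps the Chinese text readable
        self.file = open('jobs.jsonl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()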
# -*- coding: utf-8 -*-
import scrapy

from ..items import Job58CityItem


class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = ['58.com']
    # build the start url from the first listing page number
    offset = 1
    url = "https://cd.58.com/job/pn{0}/"
    start_urls = [url.format(str(offset))]

    # parse one listing page and extract every job entry
    def parse(self, response):
        for each in response.xpath("//ul[@id='list_con']/li"):
            item = Job58CityItem()
            item['job_name'] = each.xpath(".//span[@class='name']/text()").extract()[0]
            money_list = each.xpath(".//p[@class='job_salary']/text()").extract()
            money = "未知"  # "unknown" placeholder when no salary is listed
            if len(money_list) > 0:
                money = money_list[0]
            item['money'] = money
            span = each.xpath(".//div[@class='job_wel clearfix']/span")
            item['job_wel'] = []
            for i in span:
                item['job_wel'].append(i.xpath("./text()").extract()[0])
            item['company'] = each.xpath(".//div[@class='comp_name']/a/text()").extract()[0]
            item['position_type'] = each.xpath(".//span[@class='cate']/text()").extract()[0]
            item['xueli'] = each.xpath(".//span[@class='xueli']/text()").extract()[0]
            item['jingyan'] = each.xpath(".//span[@class='jingyan']/text()").extract()[0]
            # relative ".//" so the address comes from the current <li>, not the first match on the whole page
            item['address'] = each.xpath(".//span[@class='address']/text()").extract()[0]
            yield item
        # follow the next listing page until page 100
        if self.offset < 100:
            self.offset += 1
            yield scrapy.Request("https://cd.58.com/job/pn{0}/".format(str(self.offset)),
                                 callback=self.parse)
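The spider walks up to 100 listing pages, so a few settings.py entries are usually needed for the crawl to behave. These values are assumptions, not from the original post:

# settings.py (fragment; assumed values)
ROBOTSTXT_OBEY = False      # only if the listing pages are disallowed for crawlers; check before disabling
DOWNLOAD_DELAY = 1          # be polite while paging through up to 100 listing pages
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA is often required
ITEM_PIPELINES = {
    'job58city.pipelines.Job58CityPipeline': 300,  # hypothetical pipeline sketched above
}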
from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute("scrapy crawl jobs".split())
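Running this script from the directory containing scrapy.cfg is equivalent to invoking scrapy crawl jobs on the command line. Scrapy's built-in feed export can also dump the scraped items straight to a file, for example:

scrapy crawl jobs -o jobs.json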
Scraped data: