爬取智联招聘
创建项目
wljdeMacBook-Pro:PycharmProjects wlj$ scrapy startproject zhilianzhaopin
New Scrapy project 'zhilianzhaopin', using template directory '/usr/local/lib/python3.6/site-packages/scrapy/templates/project', created in: /Users/wlj/work/PycharmProjects/zhilianzhaopin
You can start your first spider with:
    cd zhilianzhaopin
    scrapy genspider example example.com
wljdeMacBook-Pro:PycharmProjects wlj$ scrapy genspider --list
Available templates:
  basic
  crawl
  csvfeed
  xmlfeed
wljdeMacBook-Pro:PycharmProjects wlj$ scrapy genspider -t crawl zhaopin sou.zhaopin.com
Created spider 'zhaopin' using template 'crawl'
wljdeMacBook-Pro:PycharmProjects wlj$
items.py
import scrapy


class ZhilianzhaopinItem(scrapy.Item):
    """Container for one job posting scraped from sou.zhaopin.com."""
    # Job title
    position = scrapy.Field()
    # Company name
    company = scrapy.Field()
    # Job description text
    position_1 = scrapy.Field()
    # Company introduction text
    company_1 = scrapy.Field()
zhaopin.py
import scrapy  # fixed: original listing read "port scrapy", a SyntaxError
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zhilianzhaopin.items import ZhilianzhaopinItem


class ZhaopinSpider(CrawlSpider):
    """Crawl python job listings (Beijing) on sou.zhaopin.com.

    Follows the paginated search-result links and parses each job-detail
    page (``*.htm``) into a :class:`ZhilianzhaopinItem`.
    """
    name = 'zhaopin'
    allowed_domains = ['sou.zhaopin.com']
    # kw=python, jl=%E5%8C%97%E4%BA%AC (URL-encoded "Beijing"), first page.
    start_urls = ['https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&p=1']

    rules = (
        # Pagination links: follow to discover more result pages, no callback.
        Rule(LinkExtractor(allow=r'python&p=\d+'), follow=True),
        # Job-detail pages: hand off to parse_item.
        Rule(LinkExtractor(allow=r'\d+.htm'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one job posting from a detail page and yield it.

        Uses ``extract_first(default='')`` instead of ``extract()[0]`` so a
        page where an XPath matches nothing yields an empty string rather
        than raising IndexError and losing the whole item.
        """
        item = ZhilianzhaopinItem()
        # Job title
        item['position'] = response.xpath(
            '//div[@class="fixed-inner-box"]//h1/text()').extract_first(default='')
        # Company name
        item['company'] = response.xpath(
            '//div[@class="fixed-inner-box"]//a/text()').extract_first(default='').strip()
        # Job description paragraphs (list of text nodes)
        item['position_1'] = response.xpath(
            '//div[@class="tab-inner-cont"]/p/text()').extract()
        # Company introduction
        item['company_1'] = response.xpath(
            '//div[@class="tab-inner-cont"]/div/div/text()').extract_first(default='').strip()

        yield item
pipelines.py
import json


class ZhilianzhaopinPipeline(object):
    """Append each scraped item to zhilian.json, one JSON object per line."""

    def __init__(self):
        # Explicit UTF-8: ensure_ascii=False below writes raw Chinese
        # characters, which raises UnicodeEncodeError on platforms whose
        # default file encoding is not UTF-8 (the original omitted encoding).
        self.filename = open("zhilian.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line (with trailing comma) and return it."""
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
settings.py
# Scrapy project settings for zhilianzhaopin.
BOT_NAME = 'zhilianzhaopin'

SPIDER_MODULES = ['zhilianzhaopin.spiders']
NEWSPIDER_MODULE = 'zhilianzhaopin.spiders'

# Do not obey robots.txt rules.
ROBOTSTXT_OBEY = False

# Route every scraped item through the JSON-writing pipeline (priority 300).
ITEM_PIPELINES = {
    'zhilianzhaopin.pipelines.ZhilianzhaopinPipeline': 300,
}

# Send DEBUG-and-above log records to dg.log instead of stderr.
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
执行
scrapy crawl zhaopin