scrapy爬取腾讯招聘信息
创建项目
scrapy startproject tencent
编写items.py
写class TencentItem
import scrapy


class TencentItem(scrapy.Item):
    """Container for one Tencent job posting scraped from hr.tencent.com."""

    # Job title
    positionname = scrapy.Field()
    # Link to the job-detail page
    positionlink = scrapy.Field()
    # Job category
    positionType = scrapy.Field()
    # Number of openings
    peopleNum = scrapy.Field()
    # Work location
    workLocation = scrapy.Field()
    # Publication date
    publishTime = scrapy.Field()
创建基础类的爬虫
scrapy genspider tencentPosition "tencent.com"
tencentPosition.py
# -*- coding: utf-8 -*-
import scrapy

from tencent.items import TencentItem


class TencentpositionSpider(scrapy.Spider):
    """Crawl Tencent HR job listings page by page and yield TencentItem objects."""

    name = "tencent"
    allowed_domains = ["tencent.com"]

    # Listing URL; the `start` query parameter is the zero-based row offset,
    # advanced by 10 per page.
    url = "http://hr.tencent.com/position.php?&start="
    offset = 0

    start_urls = [url + str(offset)]

    def parse(self, response):
        # Each job row carries class 'even' or 'odd' in the listing table.
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()

            # extract_first() returns None for an empty cell instead of the
            # original extract()[0], which raised IndexError on any malformed
            # row and killed the whole crawl.
            item['positionname'] = each.xpath("./td[1]/a/text()").extract_first()
            # Detail-page link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract_first()
            # Job category
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # Number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            # Work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract_first()
            # Publication date
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()

            yield item

        # After finishing one page, request the next one. The Request must be
        # inside the guard: once offset reaches 1680 there is nothing left to
        # fetch, and re-issuing the same URL would only hit the dupe filter.
        # NOTE(review): mutating self.offset assumes pages are processed in
        # order; with concurrent downloads this can skip/repeat offsets —
        # deriving the next offset from response.url would be safer.
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
管道文件
pipelines.py
import json


class TencentPipeline(object):
    """Append each crawled item to tencent.json, one JSON object per line."""

    def __init__(self):
        # Open in text mode with an explicit encoding. The original wrote
        # text.encode("utf-8") (bytes) into a "w"-mode (text) file, which
        # raises TypeError on Python 3; json.dumps returns str, so write str.
        self.file = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese characters human-readable in the
        # output file instead of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.file.write(line)
        # Return the item so later pipelines (if any) still receive it.
        return item

    def close_spider(self, spider):
        # Flush and release the file handle when the spider finishes.
        self.file.close()
在settings文件设置pipelines
# Register the JSON-writing pipeline; 300 is its order value
# (lower numbers run earlier when several pipelines are enabled).
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
添加请求报头
DEFAULT_REQUEST_HEADERS
settings.py
# Project identifiers generated by `scrapy startproject tencent`.
BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

# NOTE(review): True makes Scrapy honour the site's robots.txt; if it
# disallows these listing pages the spider will fetch nothing — confirm this
# is intended (tutorials for this site usually set it to False).
ROBOTSTXT_OBEY = True

# Wait 2 seconds between requests to throttle load on the server.
DOWNLOAD_DELAY = 2

# Headers attached to every request unless overridden per-request.
# NOTE(review): the User-Agent value is missing its closing ')' — it is still
# sent as-is, but looks like a truncated paste; verify against the original.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# Enable the pipeline that writes items to tencent.json (order value 300).
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}