爬取腾讯招聘
# Notes: scraping Tencent careers (careers.tencent.com) with Scrapy.
#
# Project setup (shell):
#   scrapy startproject insist                      # create the project
#   scrapy genspider teng careers.tencent.com       # create the spider (name + domain)
#   (note: the domain must match allowed_domains below — "carees" was a typo)

# ---------------------------------------------------------------------------
# items.py — the fields to scrape
# ---------------------------------------------------------------------------
import scrapy


class InsistItem(scrapy.Item):
    # Define the fields for your item here.
    positionname = scrapy.Field()  # job title (RecruitPostName)
    type = scrapy.Field()          # business group name (BGName)
    place = scrapy.Field()         # work location (LocationName)
    mian = scrapy.Field()          # job category (CategoryName)
    time = scrapy.Field()          # last update time (LastUpdateTime)


# ---------------------------------------------------------------------------
# pipelines.py — persist scraped items to a JSON-lines style file
#
# Don't forget to add this pipeline to the ITEM_PIPELINES setting.
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ---------------------------------------------------------------------------
import json


class InsistPipeline(object):
    def __init__(self):
        # utf-8 encoding so Chinese text is written verbatim
        self.f = open('teng.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one scraped item and append it to the output file.

        item: the Item object being scraped; this method must be implemented.
        Returns the item so later pipelines can keep processing it.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # Fix: close (and flush) the output file when the spider finishes.
        # The original never closed it, leaking the handle and risking
        # unflushed buffered data.
        self.f.close()


# ---------------------------------------------------------------------------
# teng.py — the spider itself
# ---------------------------------------------------------------------------
from insist.items import InsistItem


class TengSpider(scrapy.Spider):
    name = 'teng'
    allowed_domains = ['careers.tencent.com']
    # start_urls = ['http://careers.tencent.com/']
    # Paginated JSON API; the trailing pageIndex is appended per request.
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Parse one API page: yield an item per job, then request the next page."""
        contents = json.loads(response.text)
        jobs = contents['Data']['Posts']
        item = InsistItem()
        for job in jobs:
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item
        # Follow up to 10 more pages (pageIndex 2..11).
        if self.offset <= 10:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)