爬取腾讯招聘

scrapy startproject insist  #创建项目
scrapy genspider teng careers.tencent.com  #创建爬虫(爬虫名字+域名)

items.py
#需要爬取的信息
import scrapy
class InsistItem(scrapy.Item):
    """Container for one Tencent job posting scraped by the `teng` spider.

    Field names are part of the contract: the spider writes them and the
    pipeline serializes them, so they must not be renamed.
    """
    positionname = scrapy.Field()  # job title (RecruitPostName)
    type = scrapy.Field()          # business group (BGName)
    place = scrapy.Field()         # work location (LocationName)
    mian = scrapy.Field()          # job category (CategoryName)
    time = scrapy.Field()          # last update time (LastUpdateTime)


pipelines.py
#保存数据到数据库或者json文件
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

class InsistPipeline(object):
    """Item pipeline that appends each scraped item to teng.json.

    Output format: one JSON object per line, each followed by ",\n"
    (the original file layout is preserved).
    """

    def __init__(self):
        # utf-8 so Chinese text is written readably; paired with
        # ensure_ascii=False below.
        self.f = open('teng.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line and pass it on unchanged.

        This hook is required by Scrapy; returning the item lets any
        later pipelines still see it.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # Fix: the original never closed the file, so buffered writes
        # could be lost on shutdown. Scrapy calls this hook when the
        # spider finishes.
        self.f.close()

teng.py
import scrapy
import json
from insist.items import InsistItem

class TengSpider(scrapy.Spider):
    """Crawl Tencent's job-posting JSON API, 10 posts per page."""
    name = 'teng'
    allowed_domains = ['careers.tencent.com']
    # Paginated JSON endpoint; the page number is appended per request.
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield one InsistItem per job post, then request the next page."""
        contents = json.loads(response.text)
        jobs = contents['Data']['Posts']
        for job in jobs:
            # Fix: build a fresh item per job. The original created one
            # InsistItem before the loop and mutated it every iteration,
            # so all yielded references pointed at the same object — any
            # deferred processing would see only the last job's data.
            item = InsistItem()
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item
        # Follow pagination up to page 11 (offset starts at 1).
        if self.offset <= 10:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)

 

posted @ 2019-09-20 08:36  晨曦yd  阅读(482)  评论(0编辑  收藏  举报