items.py
import scrapy

class InsistItem(scrapy.Item):
    # One field per column we store: job title, business group,
    # location, category, and last-update time.
    positionname = scrapy.Field()
    type = scrapy.Field()
    place = scrapy.Field()
    mian = scrapy.Field()   # job category (filled from CategoryName)
    time = scrapy.Field()
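InsistItem behaves like a dict whose only legal keys are the declared fields; assigning any other key raises a KeyError, which catches typos early. A quick interactive check (not part of the project files):

>>> item = InsistItem(positionname='backend engineer')
>>> item['place'] = 'Shenzhen'
>>> item['salary'] = '20k'
KeyError: 'InsistItem does not support field: salary'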
pipelines.py
import pymysql

class InsistPipeline(object):
    def __init__(self):
        # One MySQL connection for the whole crawl.
        self.db = pymysql.connect(host='localhost', user='dsuser',
                                  passwd='badpassword', db='dsdb',
                                  charset='utf8', port=3306)
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        # Parameterized query: pymysql escapes the values for us.
        sql = 'INSERT INTO job(name, type, place, mian, time) VALUES (%s, %s, %s, %s, %s)'
        self.cur.execute(sql, (item['positionname'], item['type'],
                               item['place'], item['mian'], item['time']))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.db.close()
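Two assumptions the pipeline makes are worth spelling out. First, Scrapy only calls it if it is enabled in settings.py; a minimal sketch, assuming the project package is named insist (the priority value 300 is arbitrary):

settings.py
ITEM_PIPELINES = {
    'insist.pipelines.InsistPipeline': 300,
}

Second, the INSERT expects a job table to already exist in dsdb. The original does not show the schema; a plausible one (column types are guesses):

CREATE TABLE job (
    name  VARCHAR(255),
    type  VARCHAR(255),
    place VARCHAR(255),
    mian  VARCHAR(255),
    time  VARCHAR(64)
);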
insists.py
# Spider
import scrapy
import json
from insist.items import InsistItem

class InsistsSpider(scrapy.Spider):
    name = 'insists'
    allowed_domains = ['careers.tencent.com']
    # Original HTML entry point, replaced by the JSON API below:
    # start_urls = ['https://careers.tencent.com/search.html?index=']
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        contents = json.loads(response.text)
        jobs = contents['Data']['Posts']
        for job in jobs:
            # A fresh item per job: reusing one instance would let later
            # assignments overwrite items still waiting in the pipeline.
            item = InsistItem()
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item  # hand the item to the pipeline and keep looping

        # Request the next page until six pages have been fetched.
        if self.offset <= 5:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset),
                                 callback=self.parse)
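With all three files in place, the crawl is started from the project root with Scrapy's CLI (insists is the name attribute defined above):

scrapy crawl insists

Each parse call yields up to ten items (pageSize=10) and, while offset is at most 5, one follow-up Request, so the run stops after six pages, i.e. around 60 rows in the job table.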