- 使用scrapy框架爬取前程无忧上的python职位
- 创建cmd文件 start.cmd, 内容如下:
scrapy startproject Jobs
cd Jobs
scrapy genspider Job51Spider www.51job.com
- 使用编辑器(IDE)打开Jobs项目, 开始编写代码
- 打开/spiders/Job51Spider.py 写入
# -*- coding: utf-8 -*-
import json
import re
import time
from scrapy import Spider, Request
import requests
from Jobs.items import Job51Item
class Job51spiderSpider(Spider):
    """Crawl 51job search results for one keyword across several cities.

    The spider downloads 51job's area-code JavaScript table, turns it into a
    ``{city name: city code}`` mapping, requests the search-result list for
    each selected city and follows the "next page" link of every result page.
    """

    name = 'Job51Spider'
    # ``sou_url`` is served from search.51job.com, so the parent domain is
    # listed instead of only the ``www`` host; otherwise the search pages
    # count as offsite.
    allowed_domains = ['51job.com']
    start_urls = ['http://www.51job.com/']
    # Search keyword; the city code is filled in per request.
    kw = 'python'
    sou_url = 'https://search.51job.com/list/{city_code},000000,0000,00,9,99,{kw},2,1.html'
    # JavaScript file that contains the city-code table.
    city_codings_url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319'
    # Seconds to wait for the city-code download before giving up.
    city_codings_timeout = 10

    def start_requests(self):
        """Yield the first search-result request for each selected city."""
        cities = self.get_url_citycods()
        # Only the first two cities of the mapping are crawled.
        for city in list(cities)[:2]:
            yield Request(
                self.sou_url.format(city_code=cities[city], kw=self.kw),
                callback=self.parse_jobs,
                meta={'city': city},
            )

    @staticmethod
    def _first_stripped(selector, query):
        """Return the first match of *query* stripped, or '' when absent."""
        return selector.css(query).extract_first(default='').strip()

    def parse_jobs(self, response):
        """Yield one ``Job51Item`` per result row, then the next-page request.

        A row that lacks one of the expected nodes yields an empty string for
        that field instead of aborting the whole page.
        """
        city = response.meta['city']
        # The first ``.el`` row is skipped (presumably the table header --
        # TODO confirm against the page markup).
        for el in response.css('.dw_table .el')[1:]:
            item = Job51Item()
            item['soucity'] = city
            item['pname'] = self._first_stripped(el, 'span a::text')
            item['purl'] = self._first_stripped(el, 'span a::attr(href)')
            item['cname'] = self._first_stripped(el, 'span.t2 a::text')
            item['curl'] = self._first_stripped(el, 'span.t2 a::attr(href)')
            item['address'] = self._first_stripped(el, 'span.t3::text')
            # Stored as extracted: unstripped, and None when the cell is empty.
            item['pay'] = el.css('span.t4::text').extract_first()
            item['retime'] = self._first_stripped(el, 'span.t5::text')
            yield item

        # The last pager link is followed only when it is labelled "next
        # page"; a page without any pager links simply ends the crawl for
        # this city.
        labels = response.css('.bk a::text').extract()
        hrefs = response.css('.bk a::attr(href)').extract()
        if labels and hrefs and labels[-1].strip() == '下一页':
            yield Request(
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones against the current page.
                url=response.urljoin(hrefs[-1].strip()),
                callback=self.parse_jobs,
                # Kept from the original request: never let the duplicate
                # filter drop a pagination request.
                dont_filter=True,
                meta={'city': city},
            )

    def get_url_citycods(self):
        """Download the area table and return ``{city name: city code}``.

        Only entries whose code contains ``'0000'`` are kept (the original
        comment calls these the main cities). The mapping is also written to
        ``city_big.json`` in the current working directory.

        Raises:
            requests.RequestException: the download failed or timed out.
            ValueError: no ``{...}`` object was found in the downloaded text.
        """
        # NOTE: this is a blocking call made with ``requests``, outside of
        # Scrapy's downloader.
        reply = requests.get(self.city_codings_url,
                             timeout=self.city_codings_timeout)
        reply.raise_for_status()
        # Greedy match from the first '{' to the last '}' of the script.
        found = re.search(r'\{.*\}', reply.text, re.S)
        if found is None:
            raise ValueError(
                'no city-code object found in %s' % self.city_codings_url)
        st_dict = json.loads(found.group())
        # Swap keys and values so that cities can be looked up by name.
        in_dict = {
            city_name: code
            for code, city_name in st_dict.items()
            if '0000' in code
        }
        with open('city_big.json', 'wt', encoding='utf-8') as fs:
            json.dump(in_dict, fs, indent=4, ensure_ascii=False)
        return in_dict

    def parse(self, response):
        """Unused default callback; every request uses ``parse_jobs``."""
        pass
- 打开/items.py 写入
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field
class Job51Item(Item):
    """One job posting taken from a 51job search-result row."""

    soucity = Field()  # city the search was run for
    pname = Field()    # job title
    purl = Field()     # job URL
    cname = Field()    # company name
    curl = Field()     # company URL
    address = Field()  # work location
    pay = Field()      # salary
    retime = Field()   # publication date
- 打开/pipelines.py 写入
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class Job51Pipeline(object):
    """Item pipeline that inserts every item into a MongoDB collection."""

    # Name of the collection that receives the items.
    job51s = 'job51'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name for later use."""
        self.mongo_uri, self.mongo_db = mongo_uri, mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI and MONGO_DB settings."""
        settings = crawler.settings
        uri = settings.get('MONGO_URI')
        database = settings.get('MONGO_DB')
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        """Create the MongoDB client and select the configured database."""
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a dict copy of *item* and hand the item back unchanged."""
        collection = self.db[self.job51s]
        record = dict(item)
        collection.insert_one(record)
        return item
# MongoDB connection values; Job51Pipeline.from_crawler reads both of them
# from the crawler settings.
MONGO_URI = "localhost"
MONGO_DB = "jobsconnection"
# Encoding used when items are written out through feed exports.
FEED_EXPORT_ENCODING = "utf-8"
# NOTE(review): Job51Pipeline only runs when it is registered in the
# ITEM_PIPELINES setting, which is not shown here -- confirm it is set.
仅供参考学习