Preparation

  1. scrapy startproject Jobs
  2. cd Jobs
  3. scrapy genspider ZhaopinSpider www.zhaopin.com
  4. pip install diskcache
  5. pip install tinydb
  6. pip install furl
  7. scrapy crawl ZhaopinSpider
  8. scrapy crawl ZhaopinSpider -o chongqing.json
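
The search API returns Chinese text, so when exporting with -o chongqing.json it helps to force UTF-8 output. Below is a minimal sketch of settings.py tweaks for this project; the exact values are assumptions, not part of the original setup.

# settings.py -- assumed tweaks, adjust as needed
BOT_NAME = 'Jobs'
ROBOTSTXT_OBEY = False          # assumption: skip robots.txt filtering for the API host
DOWNLOAD_DELAY = 1              # assumption: throttle requests to fe-api.zhaopin.com
FEED_EXPORT_ENCODING = 'utf-8'  # keep Chinese text readable in the exported JSON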

 

ZhaopinSpider

 

# -*- coding: utf-8 -*-
import os
import json

from tinydb import TinyDB, Query
from furl import furl
import scrapy


class ZhaopinspiderSpider(scrapy.Spider):
    name = 'ZhaopinSpider'
    allowed_domains = ['www.zhaopin.com', 'sou.zhaopin.com', 'fe-api.zhaopin.com']
    start_urls = ['https://www.zhaopin.com/citymap']
    cache_db = TinyDB('ZhaopinSpider-cache.json')  # cache database for city info
    allowed_cities = ['重庆', ]  # '成都', '上海', '深圳', '昆明', '杭州', '贵阳', '宁波'  # cities allowed to crawl
    F = furl('https://fe-api.zhaopin.com/c/i/sou?pageSize=90&kt=3')  # URL template
    PAGE_SIZE = 90  # page size of the search API

    def get_city_code(self, city_name):
        '''Look up the city code by city name.'''
        Q = Query()
        city = self.cache_db.get(Q.name.search(city_name))
        if isinstance(city, dict):
            return city['code']
        else:
            # city not found in the cache; log the miss instead of silently returning None
            self.logger.warning('No cached city code for %r (got %s)', city_name, type(city))

    def init_city_info(self, response):
        '''Initialise the city info cache from the citymap page.'''
        # grab the inline script that holds the city data
        script_text = response.xpath('//script[text()[contains(., "__INITIAL_STATE__")]]/text()').extract_first()
        # strip leading/trailing whitespace
        script_text = script_text.strip()
        # keep only the JSON payload after the '=' sign
        script_json = script_text[script_text.index('=') + 1:]
        # parse the JSON string into a dict
        script_dict = json.loads(script_json)
        '''
        # dump the parsed JSON to a file for easier debugging
        with open('text.json', 'wt', encoding='utf-8') as f:
            json.dump(script_dict, f, indent=4, ensure_ascii=False)
        '''
        '''
        city_list = []  # holds the flattened city list
        # flatten the cities grouped by initial into one list for lookup
        for ch in script_dict['cityList']['cityMapList']:
            city_list.extend(script_dict['cityList']['cityMapList'][ch])
        # pick out 重庆 and grab its city code
        city_code = (list(filter(lambda city: city['name'] == '重庆', city_list)) or [{'code': None}])[0]['code']
        '''
        # cache every city entry so later lookups hit TinyDB instead of re-parsing the page
        for ch in script_dict['cityList']['cityMapList']:
            for city in script_dict['cityList']['cityMapList'][ch]:
                self.cache_db.insert(city)

    def parse(self, response):
        # if not os.path.exists('ZhaopinSpider-cache.json'):
        if not bool(self.cache_db.all()):
            self.init_city_info(response)
        # iterate over every city we want to crawl
        for city_name in self.allowed_cities:
            # kick off the first request for this city
            yield self.request_city(city_name)

    def request_city(self, city_name, page_start=0):
        '''Build the request for one page of one city's job search.'''
        city_code = self.get_city_code(city_name)
        url_data = {
            'cityId': city_code,
            'kw': 'python',
            'start': page_start
        }
        # URL of the page to crawl
        url = self.F.copy().add(url_data).url
        req = scrapy.Request(url, callback=self.parse_city, dont_filter=False)
        # pass extra data via meta; the callback can read it back from response.meta
        req.meta['city_name'] = city_name
        req.meta['page_start'] = page_start
        return req

    def parse_city(self, response):
        '''Parse one page of search results.'''
        # parse the JSON response body
        resp_dict = json.loads(response.body_as_unicode())
        # total number of results available
        num_found = resp_dict['data']['numFound']
        # page_start of the current request
        page_start = response.meta['page_start']
        # start parameter for the next request
        next_start = page_start + self.PAGE_SIZE
        # check whether there is a next page
        if next_start < num_found:
            # city name of the current request
            city_name = response.meta['city_name']
            # request the next page
            yield self.request_city(city_name, page_start=next_start)
        # emit the data from this page
        for item in resp_dict['data']['results']:
            # TODO: keep only the fields we actually need
            item['spiderName'] = self.name
            # yield each record
            yield item
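
The spider yields raw result dicts straight from the API. Since tinydb is already installed, one alternative to the -o export is a small item pipeline that stores every record in a local TinyDB file. The sketch below is hypothetical: the JobsTinyDBPipeline name and the jobs.json path are assumptions, and it still has to be registered in ITEM_PIPELINES.

# pipelines.py -- hypothetical sketch, not part of the original spider
from tinydb import TinyDB


class JobsTinyDBPipeline(object):
    '''Store every scraped job record in a local TinyDB file.'''

    def open_spider(self, spider):
        # open one database file per crawl (the path is an assumption)
        self.db = TinyDB('jobs.json')

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # item is the plain dict yielded by parse_city
        self.db.insert(dict(item))
        return item

To enable it, add ITEM_PIPELINES = {'Jobs.pipelines.JobsTinyDBPipeline': 300} to settings.py.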

 
