Using Scrapy to automatically scrape 51job and save the data to Redis, MongoDB, and MySQL
Project overview
Use Scrapy to crawl Python job postings on 51job, with the keyword "python" and the search scope set to nationwide.
Use a Redis set to store the URLs that have already been crawled, so that nothing is scraped twice.
Use a script to re-run the crawl at a fixed interval, so new postings on the site are picked up automatically.
Save the scraped results to both MongoDB and MySQL.
Main content
Site analysis
On 51job, enter the keyword "python" and set the search scope to nationwide. Inspecting the result page shows that it is a static page, so the listings can be parsed directly from the HTML response.
The URL produced by the search is the start URL for the crawl: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html
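The keyword ("python") appears directly in the path, and the trailing number before .html looks like the page index. As a minimal sketch (an assumption based on the URL pattern, not something stated above), later result pages could be generated like this:

# Build listing-page URLs, assuming the last number in the path is the page index.
base_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'
page_urls = [base_url.format(page) for page in range(1, 6)]  # pages 1-5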
Defining the fields to scrape
Write items.py and define the fields to be scraped:
import scrapy


class QcItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # data source
    source = scrapy.Field()
    # crawl time
    utc_time = scrapy.Field()
    # job title
    work_position = scrapy.Field()
    # company name
    name_company = scrapy.Field()
    # work location
    work_place = scrapy.Field()
    # salary range
    salary = scrapy.Field()
    # publish time
    publish_time = scrapy.Field()
    # job details
    content = scrapy.Field()
    # contact information
    contact = scrapy.Field()
Writing the spider
Before writing the spider itself, every request should carry its own User-Agent header, so a header-setting middleware is added to the downloader middlewares:
import random


class QcSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    # Note: despite the name, this class is used as a downloader middleware
    # (process_request is a downloader-middleware hook), and `ua` is assumed
    # to be a list of User-Agent strings defined in this module.

    def process_request(self, request, spider):
        """
        Assign a random User-Agent to each request.
        :param request:
        :param spider:
        :return:
        """
        user_agent = random.choice(ua)
        request.headers['User-Agent'] = user_agent
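The middleware above assumes a ua list of User-Agent strings in the same module, and it has to be registered as a downloader middleware. A minimal sketch of both, with placeholder User-Agent values and an assumed project package name of qc (neither is shown in the original):

# middlewares.py -- assumed definition of the `ua` list (placeholder values)
ua = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]

# settings.py -- register the class as a downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'qc.middlewares.QcSpiderMiddleware': 543,
}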
With the header middleware in place, return to the spider file and write the parse method to extract the listing data:
import re

import scrapy

from ..items import QcItem


class QcSpider(scrapy.Spider):
    name = 'qc'
    # allowed_domains = ['51job.com']
    # start URL
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html']

    def parse(self, response):
        # The downloader middleware (written first) adds a User-Agent to every request.
        # Parse the listing page.
        node_list = response.xpath('//div[@class="el"]')
        for node in node_list:
            # Extract the detail-page link. On 51job the first four nodes are
            # column headers rather than job postings, so they have no detail
            # link and can simply be skipped.
            detail_link = node.xpath('./p/span/a/@href')
            if detail_link:
                item = QcItem()
                item['work_position'] = node.xpath('./p/span/a/@title').extract_first()
                item['name_company'] = node.xpath('./span[@class="t2"]/a/text()').extract_first()
                item['work_place'] = node.xpath('./span[@class="t3"]/text()').extract_first()
                item['salary'] = node.xpath('./span[@class="t4"]/text()').extract_first()
                item['publish_time'] = node.xpath('./span[@class="t5"]/text()').extract_first()
                # Follow the detail page to parse the job description.
                yield scrapy.Request(detail_link.extract_first(),
                                     callback=self.parse_detail,
                                     meta={"item": item})
Before parsing the detail pages, set up Redis in a downloader middleware and add each detail-page link to a Redis set. This is what prevents duplicate crawling: if a detail-page URL is already in the set, the request is ignored.
import hashlib

import redis
from scrapy.exceptions import IgnoreRequest


class QcRedisMiddleware(object):
    """
    Store each detail-page URL from the listing page in a Redis set
    to prevent duplicate crawling.
    """

    # connect to Redis
    def __init__(self):
        self.redis = redis.StrictRedis(host='localhost', port=6379, db=1)

    def process_request(self, request, spider):
        # Only detail-page links are stored in Redis.
        if request.url.startswith("https://jobs.51job.com/"):
            # MD5-hash the detail-page URL.
            url_md5 = hashlib.md5(request.url.encode()).hexdigest()
            # sadd returns 1 if the member was added, 0 if it was already present.
            result = self.redis.sadd('qc_url', url_md5)
            # If it was already present, the page has been crawled before, so skip it.
            if not result:
                raise IgnoreRequest
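This middleware also has to be enabled in settings.py (again assuming the qc package name). Between runs, the fingerprint set can be inspected or cleared with a couple of redis-py calls; this check is not part of the project, just a convenient way to verify that deduplication is working:

# settings.py -- enable both downloader middlewares (module path assumed)
DOWNLOADER_MIDDLEWARES = {
    'qc.middlewares.QcSpiderMiddleware': 543,
    'qc.middlewares.QcRedisMiddleware': 544,
}

# Quick check of the fingerprint set from a Python shell:
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=1)
print(r.scard('qc_url'))   # number of detail pages seen so far
# r.delete('qc_url')       # clear the set to force a full re-crawl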
Back in the spider file, write the parsing logic for the detail pages:
def parse_detail(self, response):
    item = response.meta['item']
    # The QcRedisMiddleware downloader middleware has already stored this
    # detail-page URL in Redis for deduplication.
    # Extract the full job description text.
    content = response.xpath('//div[@class="bmsg job_msg inbox"]').xpath('string(.)').extract()
    # content = response.xpath('//div[@class="bmsg job_msg inbox"]/*/text()').extract()
    # Extract the contact information.
    contact = response.xpath('//div[@class="bmsg inbox"]/p/text()').extract()
    # The extracted content contains spaces and newlines; strip all
    # whitespace with a regular expression.
    item['content'] = re.sub(r'\s', '', ''.join(content))
    item['contact'] = ''.join(contact).strip()
    yield item
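As a standalone illustration (not taken from the project), this is what the whitespace cleanup in parse_detail does; stripping every whitespace character is harmless for Chinese job descriptions, which contain no inter-word spaces:

import re

# Sample fragments as they might come back from the XPath extraction.
raw = ['Responsibilities:\r\n ', ' 1. Build crawlers;\n', '  2. Clean data.']
print(re.sub(r'\s', '', ''.join(raw)))
# -> Responsibilities:1.Buildcrawlers;2.Cleandata.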
At this point all of the required fields have been extracted; what remains is to save the data.
Saving the data
Write pipelines.py to save the items. The data is stored in MongoDB and MySQL (a JSON-lines file is written as well):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from datetime import datetime

import pymongo
import pymysql


class QcPipeline(object):

    def process_item(self, item, spider):
        # Record the data source (the spider name).
        item['source'] = spider.name
        # Record the crawl time (UTC).
        item['utc_time'] = str(datetime.utcnow())
        return item


class QcJsonPipeline(object):
    """
    Save the items as JSON lines.
    """

    def open_spider(self, spider):
        # Open the output file.
        self.file = open('qc.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(content)
        return item

    def close_spider(self, spider):
        self.file.close()


class QcMongoPipeline(object):
    """
    Save the items to MongoDB.
    """

    def open_spider(self, spider):
        # Create the MongoDB client and connect.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # Database and collection are both named "qc".
        self.collection = self.client['qc']['qc']

    def process_item(self, item, spider):
        # Insert the item (insert_one replaces the deprecated insert).
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the connection.
        self.client.close()


class QcMysqlPipeline(object):
    """
    Save the items to MySQL.
    """

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            database='qc',
            user='z',
            password='136833',
            charset='utf8'
        )
        # Create a cursor.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into qc(source, utcTime, workName, "
               "company, workPosition, salary, publishTime, "
               "content, contact)"
               "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        list_item = [item['source'], item['utc_time'], item['work_position'],
                     item['name_company'], item['work_place'], item['salary'],
                     item['publish_time'], item['content'], item['contact']]
        self.cursor.execute(sql, list_item)
        # Commit the transaction.
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

# The table used by QcMysqlPipeline:
# create table qc
# (
#     id INT unsigned PRIMARY KEY auto_increment NOT NULL,
#     source VARCHAR(20) DEFAULT "",
#     utcTime DATETIME DEFAULT "1111-11-11 11:11:11",
#     workName VARCHAR(40) DEFAULT "",
#     company VARCHAR(40) DEFAULT "",
#     workPosition VARCHAR(40) DEFAULT "",
#     salary VARCHAR(40) DEFAULT "",
#     publishTime VARCHAR(20) DEFAULT "",
#     content TEXT(1024),
#     contact VARCHAR(40) DEFAULT ""
# );
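The pipelines have to be registered in settings.py. A minimal sketch, assuming the project package is named qc; the priority numbers are illustrative, but QcPipeline must get the lowest number so that source and utc_time are filled in before the storage pipelines run:

# settings.py -- pipeline registration (module paths and priorities assumed)
ITEM_PIPELINES = {
    'qc.pipelines.QcPipeline': 300,
    'qc.pipelines.QcJsonPipeline': 301,
    'qc.pipelines.QcMongoPipeline': 302,
    'qc.pipelines.QcMysqlPipeline': 303,
}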
Automatic crawling
Finally, to make the crawl automatic, write a separate script that re-runs the spider at a fixed interval:
# Run the spider on a schedule so newly published postings are picked up.
# cmdline.execute would only work for a single run, because the Twisted
# reactor cannot be restarted within one process, so each crawl is launched
# as a separate process via os.system instead.
# from scrapy import cmdline
# cmdline.execute("scrapy crawl qc".split())
import os
import time

while True:
    # Re-run the crawl, then wait 20 seconds before the next run.
    os.system("scrapy crawl qc")
    time.sleep(20)
Full code
See: https://github.com/zInPython/qiancheng