scrapy 爬取拉勾网
一、模板使用
使用 Scrapy 创建爬虫时,可以指定用哪个模板来生成爬虫文件。
默认建立爬虫文件的命令:
scrapy genspider 爬虫名称 爬虫地址
可以用 scrapy genspider --list 命令查看 Scrapy 自带的模板:
$ scrapy genspider --list
Available templates:
basic
crawl
csvfeed
xmlfeed
通过 crawl 模板生成拉勾网爬虫文件:
$ scrapy genspider -t crawl lagou www.lagou.com
Created spider 'lagou' using template 'crawl' in module: ArticleSpider.spiders.lagou
二、编写lagou.py
from datetime import datetime

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import LagouJobItemLoader, LagouJobItem
from ..utils.common import get_md5


class LagouSpider(CrawlSpider):
    """Crawl www.lagou.com and load every job-detail page into a LagouJobItem.

    Links whose URL matches ``jobs/<digits>.html`` are followed and handed to
    :meth:`parse_job`.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        # Optional extra rules (disabled): follow listing and company pages.
        # Rule(LinkExtractor(allow=r'zhaopin/.*'), follow=True),
        # Rule(LinkExtractor(allow=r'gongsi/j\d+\.html'), follow=True),
        # The dot is escaped so that only a literal ".html" suffix matches.
        Rule(LinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_job', follow=True),
    )

    # These two CrawlSpider hooks can be overridden here if needed:
    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        """Extract the job posting fields from a job-detail response.

        Args:
            response: the downloaded job-detail page.

        Returns:
            The item produced by ``LagouJobItemLoader.load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        # Identity of the posting.
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        # The "job_request" paragraph: salary plus the 2nd..5th <span>.
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')

        # Free-form details of the posting.
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')

        # Company information.
        item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
        item_loader.add_css('company_url', '#job_company dt a::attr(href)')

        item_loader.add_value('crawl_time', datetime.now())

        return item_loader.load_item()
三、编写items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from w3lib.html import remove_tags

from ArticleSpider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT


def remove_splash(value):
    """Return *value* with every "/" removed and surrounding whitespace stripped."""
    return value.replace("/", "").strip()


def handle_jobaddr(value):
    """Collapse the multi-line address text into a single string.

    Every line is stripped, lines equal to "查看地图" (the "view map" link
    label) are dropped, and the remaining pieces are concatenated with no
    separator.
    """
    stripped_lines = (line.strip() for line in value.split("\n"))
    return ''.join(part for part in stripped_lines if part != "查看地图")


class LagouJobItemLoader(ItemLoader):
    """Item loader whose fields output their first collected value."""

    default_output_processor = TakeFirst()


class LagouJobItem(scrapy.Item):
    """A job posting scraped from lagou.com."""

    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        """Build the parameterized upsert statement for this item.

        Returns:
            A ``(sql, params)`` tuple suitable for ``cursor.execute``.

        Raises:
            KeyError: if ``title``, ``url``, ``url_object_id`` or
                ``crawl_time`` is missing from the item.
        """
        insert_sql = """
            insert into lagou_job(title,url,url_object_id,salary,job_city,work_years,degree_need,
            job_type,publish_time,job_advantage,job_desc,job_addr,company_name,company_url,
            tags,crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        # The item loader leaves a field unset when nothing was extracted for
        # it (e.g. a posting without tags), so scraped detail fields are read
        # with .get() and stored as NULL instead of raising KeyError.
        # title/url/url_object_id/crawl_time stay mandatory so that a page
        # which is not a job posting still fails instead of inserting an
        # empty row.
        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self.get("salary"),
            self.get("job_city"),
            self.get("work_years"),
            self.get("degree_need"),
            self.get("job_type"),
            self.get("publish_time"),
            self.get("job_advantage"),
            self.get("job_desc"),
            self.get("job_addr"),
            self.get("company_name"),
            self.get("company_url"),
            self.get("tags"),
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params
四、编写pipelines.py
# -*- coding: utf-8 -*-
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
import json
import codecs  # similar to open(), but takes care of much of the encoding work
from scrapy.exporters import JsonItemExporter
from scrapy.pipelines.images import ImagesPipeline


class MysqlTwistedPipline(object):
    """Item pipeline that writes items to MySQL through a Twisted adbapi pool."""

    def __init__(self, dbpool):
        """Store the adbapi connection pool used for all inserts."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from the MYSQL_* entries of the Scrapy settings."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert of *item* through the pool and pass the item on.

        Returns:
            The unchanged item, so that pipelines with a lower priority still
            receive it.
        """
        # Hand the insert to the adbapi pool so it runs asynchronously.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # report failures of the insert
        return item

    def handle_error(self, failure):
        """Errback for the asynchronous insert: print the failure."""
        print(failure)

    def do_insert(self, cursor, item):
        """Execute the item's own insert statement on *cursor*."""
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
五、编辑settings.py
import os

# Enable the asynchronous MySQL pipeline.
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipline': 300,
}

# MySQL connection settings read by MysqlTwistedPipline.from_settings.
# Each value can be overridden through an environment variable of the same
# name so that real credentials need not be committed; the defaults are the
# tutorial's local development values.
MYSQL_HOST = os.environ.get("MYSQL_HOST", "127.0.0.1")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "article_spider")
MYSQL_USER = os.environ.get("MYSQL_USER", "root")
MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "123")

# strftime formats used when writing dates/datetimes to SQL.
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"