python爬虫学习:从数据库读取目标爬虫站点及爬虫规则,批量爬取目标站点指定数据(scrapy框架)

  1. 数据库databaseConfig.py
from urllib.parse import quote_plus
from pymongo import MongoClient
import settings


class DB:
    """Open a MongoDB connection from the values in the ``settings`` module.

    Instance attributes set by ``__init__``:
      - ``client``: the ``MongoClient``, kept so callers can close it.
      - ``collection``: the *database* handle ``client[dbname]`` (the name is
        historical); index it with a collection name to get a collection.
      - ``spider_result_sheet_name``: name of the collection for scraped data.
      - ``spider_station_sheet_name``: name of the collection that holds the
        target sites and their crawl rules.
    """

    def __init__(self):
        # Connection parameters come from the project settings module.
        host = settings.MONGODB_HOST
        port = settings.MONGODB_PORT
        dbname = settings.MONGODB_DBNAME
        user_name = settings.MONGODB_USERNAME
        password = settings.MONGODB_PASSWORD

        # Collection that stores the scraped data.
        self.spider_result_sheet_name = settings.MONGODB_SAVE_SPIDER_RESULT_SHEET_NAME
        # Collection that stores the target-site information.
        self.spider_station_sheet_name = settings.MONGODB_SPIDER_STATION_SHEET_NAME

        # Build the MongoDB connection URI. quote_plus() only accepts str or
        # bytes, so the port is converted first: it is commonly configured as
        # an int, which used to raise TypeError here.
        uri = "mongodb://%s:%s@%s:%s" % (
            quote_plus(user_name),
            quote_plus(password),
            quote_plus(host),
            quote_plus(str(port)),
        )
        # Keep the client reachable so it can be closed by the owner.
        self.client = MongoClient(uri)
        # Select the database.
        self.collection = self.client[dbname]
  1. 修改 scrapy 框架的 pipelines.py 文件,添加爬虫数据保存到数据库的方法
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import codecs
import json
import os
from MySpider.databaseConfig import DB


class MyScrapyPipeline:
    """Pass-through pipeline that forwards every item unmodified."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        passthrough = item
        return passthrough

# # 以json文件保存
# class JsonPipeline(object):
#     def process_item(self, item, spider):
#         # base_dir = os.getcwd()
#         # filename = base_dir + '/spiderData.json'
#         filename = 'D:/development/datas' + '/spiderData.json'
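#         # Open the JSON file in append mode and write each item with json.dumps.
#         # ensure_ascii=False is required; otherwise non-ASCII text is stored as
#         # escape sequences such as "\u4e2d" instead of readable characters.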
#         with codecs.open(filename, 'a', encoding='utf-8') as f:
#             line = json.dumps(dict(item), ensure_ascii=False) + '\n'
#             f.write(line)
#         return item


# Save scraped items to MongoDB.
class SpiderMongoPipeline(object):
    """Item pipeline that stores each scraped item in the result collection."""

    # Lazily created DB handle. It is created once per pipeline instance
    # instead of once per item, so a new MongoDB client is not opened for
    # every scraped item.
    _db = None

    def process_item(self, item, spider):
        """Insert ``item`` as one document and return it unchanged.

        Args:
            item: the scraped item; converted with ``dict(item)`` before insert.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        if self._db is None:
            self._db = DB()
        db = self._db
        data = dict(item)
        # insert_one() replaces Collection.insert(), which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db.collection[db.spider_result_sheet_name].insert_one(data)
        return item
  1. 编辑items.py 对应数据库字段
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class MyDataItem(Item):
    """Container for one scraped article; declares the fields an item may carry."""

    # Article title.
    title = Field()
    # Article author.
    author = Field()
    # Publication time as extracted from the page.
    release_time = Field()
    # URL of the article detail page.
    url = Field()
    # Time at which the record was scraped.
    create_time = Field()
  1. 核心爬虫方法mySpider.py
# coding=utf-8

import time
import scrapy
from scrapy.selector import Selector
from mySpider.databaseConfig import DB
from mySpider.items import MyDataItem


class MySpider(scrapy.Spider):
    """Spider whose target sites and XPath rules are loaded from MongoDB.

    Each station document read in ``start_requests`` is expected to provide the
    keys used below: ``station_url``, ``station_root_url``, ``station_name``,
    ``table_xpath``, ``article_detail_xpath``, ``title_xpath``,
    ``author_xpath`` and ``release_time_xpath``.
    """

    name = 'mySpider'  # Unique spider identifier; used when launching the crawl.

    def start_requests(self):
        """Yield one listing-page request per station stored in the database."""
        # Build the DB helper once; each DB() opens its own MongoDB client.
        db = DB()
        stations = db.collection[db.spider_station_sheet_name]
        for station in stations.find():
            # Nest the rules under one key so they stay separate from the
            # bookkeeping keys Scrapy itself stores in Request.meta.
            yield scrapy.Request(
                url=station["station_url"],
                meta={"station": station},
                callback=self.parse_station,
            )

    def parse_station(self, response):
        """Parse a station's listing page and request every article detail page."""
        station = response.meta["station"]
        root_url = station["station_root_url"]
        for article in response.xpath(station["table_xpath"]):
            href = article.xpath(station["article_detail_xpath"]).extract_first()
            if not href:
                # A row without a link (e.g. a header row) must not abort the
                # remaining rows of the listing.
                self.logger.warning("No article link matched in a row of %s", response.url)
                continue
            # Only prefix the site root when the link is not already absolute.
            if href.startswith(("http://", "https://")):
                article_detail_url = href
            else:
                article_detail_url = root_url + href
            # dont_filter=True bypasses the duplicate filter; without it
            # parse_detail was observed to run only once.
            yield scrapy.Request(
                url=article_detail_url,
                meta={"station": station},
                callback=self.parse_detail,
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Extract one article from a detail page and yield it as a MyDataItem.

        Pages whose title XPath matches nothing are skipped with a warning.
        The author falls back to the station name when ``author_xpath`` is
        empty or matches nothing; ``release_time`` is ``None`` when its XPath
        matches nothing.
        """
        station = response.meta["station"]

        title = response.xpath(station["title_xpath"]).extract_first()
        if title is None:
            self.logger.warning("No title matched on %s; item skipped", response.url)
            return

        author = None
        if station["author_xpath"] != "":
            author = response.xpath(station["author_xpath"]).extract_first()
        if author is None:
            author = station["station_name"]

        item = MyDataItem()
        item['title'] = title
        item['author'] = author
        item['release_time'] = response.xpath(station["release_time_xpath"]).extract_first()
        item['url'] = response.url
        item['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        yield item  # Hand the item over to the pipelines in pipelines.py.

`dont_filter=True` 表示该请求不经过去重过滤器,否则会导致 parse_detail 只执行一次。这是一个坑点:前期由于查询资料的方向和关键字不对,卡壳了很久,最后搜到《scrapy - Request 中的回调函数不执行或者只执行一次》这篇文章才得以解决。

  1. settings.py 修改(以下为settings.py的部分配置内容)
# Name of this Scrapy project/bot.
BOT_NAME = 'mySpider'

# Module path (from the project root) of the package that contains the spiders.
# NOTE(review): the package is called 'myScrapy' here, but the other files in
# this write-up import it as 'mySpider' / 'MySpider' — confirm the real name.
SPIDER_MODULES = ['myScrapy.spiders']
NEWSPIDER_MODULE = 'myScrapy.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# LOG_LEVEL = 'ERROR'

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enabled item pipelines; the integer is the pipeline's priority.
ITEM_PIPELINES = {
    'myScrapy.pipelines.SpiderMongoPipeline': 200
}

# The MongoDB connection settings that follow are omitted from this excerpt.
  1. 启动类main.py
from scrapy import cmdline
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# The imports above are placed before the first call because
# cmdline.execute() ends the process when the command finishes; statements
# placed after it never run.

# Option 1: run a single spider; results are stored as configured in pipelines.py.
# The argument after "crawl" must be the spider's `name` attribute
# ('mySpider' for the spider defined in mySpider.py).
cmdline.execute("scrapy crawl mySpider".split())

# Option 2: run a single spider and save the output to a file in the project root.
# cmdline.execute("scrapy crawl mySpider -o rsj.json".split())


# Option 3: run a chosen set of spiders.
# Variant 1
# cmdline.execute("scrapy crawlProcess rsj cqgsdx".split())
# Variant 2
# cmdline.execute(['scrapy', 'crawl', 'mySpider'])

# Option 4: run every spider in the project except the ones listed.
# process = CrawlerProcess(get_project_settings())
# didntWorkSpider = ['rsj', 'cqgsdx']  # spiders that should NOT be run

# process_spider_list = process.spiders.list()  # all spiders under the spiders package
# for the_spider_name in process_spider_list:
#     if the_spider_name in didntWorkSpider:
#         continue
#     print("Running spider %s" % (the_spider_name))
#     process.crawl(the_spider_name)

# process.start()
posted @ 2021-01-12 10:18  httpc  阅读(612)  评论(0)  编辑  收藏  举报