scrapy 文件下载配置

爬取matplotlib作图库

    matplotlib 是非常有用的作图库，官网上提供了许多实例，可在 https://matplotlib.org/examples/index.html 查到。我们就把这些文件下载到本地，方便以后查找使用。

1 pipelines.py 管道模块：自定义文件下载管道类——主要用于自定义保存的文件名

from urllib import parse
from os.path import dirname,basename,join
from scrapy.pipelines.files import FilesPipeline

class MatpDownloadPipeline(FilesPipeline):
    """File pipeline that stores downloads under short, readable paths.

    Scrapy's default FilesPipeline names files by the SHA1 hash of the URL.
    This subclass instead stores each file as
    '<last-directory-of-url-path>/<original-filename>',
    e.g. 'animation/basic_example.py', relative to FILES_STORE.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for the file of *request*.

        The keyword-only ``item`` parameter matches the signature Scrapy
        (>= 2.4) calls with, while remaining backward-compatible with
        callers that omit it.
        """
        # Use only the URL's path component: scheme, host and query string
        # must not leak into the on-disk filename.
        url_path = parse.urlparse(request.url).path
        parent_dir = dirname(url_path)
        file_name = basename(url_path)
        # Keep just the last directory component so files group by category.
        return join(basename(parent_dir), file_name)

2 settings.py 配置

# Register the custom download pipeline (lower number = runs earlier).
ITEM_PIPELINES = {
    'img.pipelines.MatpDownloadPipeline': 200,
}
# Root directory where downloaded files are stored.
FILES_STORE = 'examples_src'

3 爬虫应用代码

import scrapy
from ..items import MatpItem
from scrapy.linkextractors import LinkExtractor
from urllib import parse

class AnmSpider(scrapy.Spider):
    """Crawl the matplotlib examples index and collect every example's
    source-file URL for the file-download pipeline."""

    name = 'anm'
    allowed_domains = ['matplotlib.org']
    start_urls = ['https://matplotlib.org/examples/index.html']

    # NOTE: the original start_requests override only re-created the default
    # behavior (Request per start_url, parse as callback) while passing an
    # inconsistently spelled encoding kwarg ("UTF-8" vs "UTF_8"); the base
    # class default is equivalent, so the override is dropped.

    def parse(self, response):
        """Follow the link of every example listed on the index page."""
        # Example links live inside the table-of-contents tree of the index.
        extractor = LinkExtractor(
            restrict_xpaths="//div[@class='toctree-wrapper compound']/ul/li/ul/li/a")
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Return an item whose ``file_urls`` holds the absolute URL of the
        example's downloadable source file, or None if the page has none."""
        source_url = response.xpath(
            "//a[@class='reference external']/@href").extract_first()
        # extract_first() returns None when the link is missing; guard it so
        # response.urljoin(None) cannot raise.
        if source_url is None:
            return None

        item = MatpItem()
        item["file_urls"] = [response.urljoin(source_url)]
        return item
posted @ 2020-10-21 19:37  小杜打醋尢买布  阅读(230)  评论(0编辑  收藏  举报