Scrapy Pagination

1. Scrapy detail pages

runner.py

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl book_spider".split())

book_spiders.py

import scrapy
from scrapy.http.response.html import HtmlResponse
from urllib.parse import urljoin


class BookSpiderSpider(scrapy.Spider):
    name = "book_spider"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://shicimingju.com/book/hongloumeng.html"]

    def parse(self, response: HtmlResponse, **kwargs):
        # print(response.text)
        res = response.xpath("//div[@class='contbox cont11']//a")
        for item in res:
            book_url = item.xpath("./@href").extract_first()
            # print(book_url)
            # URL joining: approach 1
            # new_url = urljoin(BookSpiderSpider.start_urls[0], book_url)
            # URL joining: approach 2
            # new_url = urljoin(response.url, book_url)
            # print(new_url)

            # Scrapy's built-in URL joining (see the sketch after this listing)
            new_url = response.urljoin(book_url)
            # print(new_url)
            yield scrapy.Request(
                url=new_url,
                callback=self.parse_detail
            )
            # break

    # Detail page
    def parse_detail(self, response: HtmlResponse, **kwargs):
        # print(response.text)
        content = response.xpath("//div[@class='text p_pad']//text()").extract()
        content = "".join(content).replace("\xa0", "").strip()
        # print(content)
        yield {
            "book_url": response.url,
            "book_content": content
        }
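
For reference, all three joining approaches above resolve the chapter link against the page URL in the same way. A minimal sketch (the href value here is made up for illustration):

from urllib.parse import urljoin

page_url = "https://shicimingju.com/book/hongloumeng.html"
href = "/book/hongloumeng/1.html"  # hypothetical chapter link from the index page

# urljoin resolves the site-relative href against the current page URL
print(urljoin(page_url, href))
# -> https://shicimingju.com/book/hongloumeng/1.html

# response.urljoin(href) is shorthand for urljoin(response.url, href)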

pipelines.py

class BookPipeline:
    def process_item(self, item, spider):
        # print each detail-page item (a file-writing variant is sketched after this listing)
        print(item)
        return item
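
The pipeline above only prints each chapter. To actually persist the items, a minimal sketch of a file-writing pipeline could look like the following (the file name book.jsonl is my own choice, not part of the original project; the class would still need to be registered in ITEM_PIPELINES):

import json


class BookSavePipeline:
    # Hypothetical pipeline: writes one JSON line per chapter item.
    def open_spider(self, spider):
        self.f = open("book.jsonl", mode="w", encoding="utf-8")

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(json.dumps(item, ensure_ascii=False) + "\n")
        return item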

settings.py

# Scrapy settings for book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "book"

SPIDER_MODULES = ["book.spiders"]
NEWSPIDER_MODULE = "book.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "book (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"



# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "cookie": "Hm_lvt_649f268280b553df1f778477ee743752=1731913174; HMACCOUNT=BE3BFACC7E1F214F; Hm_lpvt_649f268280b553df1f778477ee743752=1731913320",
    "priority": "u=0, i",
    "referer": "https://www.shicimingju.com/book",
    "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "book.middlewares.BookSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "book.middlewares.BookDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   "book.pipelines.BookPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

2. Scrapy pagination

runner.py

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl shu".split())

shu.py

import scrapy


class ShuSpider(scrapy.Spider):
    name = "shu"
    allowed_domains = ["dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    def parse(self, response, **kwargs):
        print(response.url)  # print each listing-page URL to see the pagination working
        book_infos = response.xpath('//div[@class="book-info"]')
        for book_info in book_infos:
            href = book_info.xpath('.//h3/a/@href').extract_first()
            new_url = response.urljoin(href)
            # print(new_url)

            yield scrapy.Request(
                # detail page URL
                url=new_url,
                callback=self.parse_detail
            )
        # handle pagination
        a_list = response.xpath("//div[@class='pages']/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            fenye_url = response.urljoin(href)
            # Just yield the request; even duplicate URLs are fine (explained after this listing)
            yield scrapy.Request(
                url=fenye_url,
                callback=self.parse  # for pagination, the callback is parse itself
            )


    def parse_detail(self, response):
        # book title
        name = response.xpath('//div[@class="book-title"]/h1/text()').extract_first()
        # author
        author = response.xpath('//div[@class="book-details"]//tr/td[2]/text()').extract_first()

        yield {
            'name': name,
            'author': author
        }
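
Re-yielding pagination links that may already have been crawled works because Scrapy's scheduler deduplicates requests by fingerprint: each page is downloaded only once, so the crawl terminates on its own even though every listing page yields links to every other page. If a URL ever does need to be fetched again, the filter can be bypassed per request, e.g. by changing the yield above to:

            yield scrapy.Request(
                url=fenye_url,
                callback=self.parse,
                dont_filter=True  # skip the scheduler's duplicate check for this request
            )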

pipelines.py

class DushuPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
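
Since the spider already yields plain dicts, an alternative to a print-only pipeline is Scrapy's feed export, which writes items straight to a file. A minimal sketch of what could be added to settings.py (not part of the original project; the file name books.json is arbitrary):

# Export every yielded item to a UTF-8 JSON file;
# roughly equivalent to running `scrapy crawl shu -o books.json`
FEEDS = {
    "books.json": {"format": "json", "encoding": "utf8"},
}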

settings.py

# Scrapy settings for dushu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "dushu"

SPIDER_MODULES = ["dushu.spiders"]
NEWSPIDER_MODULE = "dushu.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "dushu (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"



# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
   "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
   "accept-encoding": "gzip, deflate, br, zstd",
   "accept-language": "zh-CN,zh;q=0.9",
   "cache-control": "max-age=0",
   "cookie": "Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1732085842; HMACCOUNT=BE3BFACC7E1F214F; __gads=ID=17dcdd5451ae0620:T=1732086997:RT=1732090519:S=ALNI_MZMXqqiZj5OtUfMkZsq7u3ui3J_eQ; __gpi=UID=00000f99c374b4db:T=1732086997:RT=1732090519:S=ALNI_MYxmun9cz-LQpIQLeRvTL7vMf_M2g; __eoi=ID=dd0db9b091de93d2:T=1732086997:RT=1732090519:S=AA-AfjbLn9GNmUWnL1isA5Q6NFpH; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1732091957",
   "priority": "u=0, i",
   "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
   "sec-ch-ua-mobile": "?0",
   "sec-ch-ua-platform": "\"Windows\"",
   "sec-fetch-dest": "document",
   "sec-fetch-mode": "navigate",
   "sec-fetch-site": "none",
   "sec-fetch-user": "?1",
   "upgrade-insecure-requests": "1",
   "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "dushu.middlewares.DushuSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "dushu.middlewares.DushuDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   "dushu.pipelines.DushuPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

3. Downloading images with Scrapy

runner.py

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl dongman".split())

dongman.py

import scrapy
import re
import json


class DongmanSpider(scrapy.Spider):
    name = "dongman"
    allowed_domains = ["zol.com.cn", "zol-img.com.cn"]
    start_urls = ["https://desk.zol.com.cn/dongman/"]

    obj = re.compile(r"var deskPicArr 		= (?P<code>.*?);", re.S)

    def parse(self, resp, **kwargs):
        li_list = resp.xpath('//li[@class="photo-list-padding"]')
        for li in li_list:
            href = li.xpath("./a/@href").extract_first()
            if href.endswith(".exe"):
                continue
            new_url = resp.urljoin(href)
            # print(new_url)
            yield scrapy.Request(
                url=new_url,
                callback=self.parse_detail
            )
            # break

    def parse_detail(self, resp, **kwargs):
        # Detail page handling:
        # grab the image download address
        # src = resp.xpath('//img[@class="bigImg"]/@src').extract_first()
        # download the image
        # print(resp.text)
        title = resp.xpath("//title/text()").extract_first().split("-")[0]
        code = DongmanSpider.obj.search(resp.text).group("code")
        # print(code)
        dic = json.loads(code)
        for item in dic['list']:
            fenban = item["resAll"][0]  # first available resolution, e.g. "144x90"
            img_src = item["imgsrc"]
            img_src = img_src.replace("##SIZE##", fenban + "c5")
            # print(img_src)

            yield {
                "img_src": img_src,
                "title": title,
                "referer": resp.url
            }
# "http://desk-fd.zol-img.com.cn/t_s##SIZE##/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg"
# https://desk-fd.zol-img.com.cn/t_s144x90c5/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg
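
Judging purely from the keys the code reads (dic['list'], item['resAll'][0], item['imgsrc']), the deskPicArr variable embedded in the detail page should be shaped roughly like this (the values below are illustrative, not copied from the site):

# Approximate structure of the JSON pulled out by the regex above
{
    "list": [
        {
            "resAll": ["144x90", "960x600"],                          # available resolutions
            "imgsrc": "http://.../t_s##SIZE##/g5/.../xxx.jpg"          # template URL; ##SIZE## is substituted
        }
    ]
}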

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
import scrapy


class TuPipeline:
    def process_item(self, item, spider):
        print(item)
        return item


class MytuPipeline(ImagesPipeline):
    # issue the image download request
    def get_media_requests(self, item, info):
        yield scrapy.Request(
            url=item["img_src"],
            headers={
                "referer": item["referer"]
            },
            meta={
                "title": item["title"],
                "img_src": item["img_src"]}
        )

    # build the save path (relative to IMAGES_STORE; see the note after this listing)
    def file_path(self, request, response=None, info=None, *, item=None):
        my_path = 'tu/'
        title = request.meta["title"]
        file_name = request.meta["img_src"].split("/")[-1]
        return my_path + title + "/" + file_name

    # cleanup after the download finishes
    def item_completed(self, results, item, info):
        return item
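
The string returned by file_path is joined onto IMAGES_STORE (set to "./" in settings.py below), so the downloaded wallpapers end up in a layout like this (the title directory is only an example; the file name is taken from the sample URL above):

# ./tu/<page title>/<original file name>
# e.g. ./tu/SomeWallpaperTitle/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg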

settings.py

# Scrapy settings for tu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "tu"

SPIDER_MODULES = ["tu.spiders"]
NEWSPIDER_MODULE = "tu.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tu (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "cookie": "ip_ck=3Yqq14y/v8EuOTc2MjA1LjE3MDE2NzE2NjM%3D; z_pro_city=s_provice%3Dbeijing%26s_city%3Dbeijing; userProvinceId=1; userCityId=0; userCountyId=0; userLocationId=1; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1724053531,1725013913; HMACCOUNT=0BFAD8D83E97B549; lv=1725025604; vn=12; z_day=icnmo11564%3D1%26ixgo20%3D1; Adshow=5; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1725025698; questionnaire_pv=1724976004",
    "dnt": "1",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "tu.middlewares.TuSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "tu.middlewares.TuDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "tu.pipelines.TuPipeline": 300,
    "tu.pipelines.MytuPipeline": 250
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

MEDIA_ALLOW_REDIRECTS = True
# Required for downloading images:
# root directory for everything saved by the ImagesPipeline
IMAGES_STORE = "./"