Scrapy Pagination
1. Scrapy detail pages
runner.py
from scrapy.cmdline import execute
if __name__ == '__main__':
execute("scrapy crawl book_spider".split())
book_spiders.py
import scrapy
from scrapy.http.response.html import HtmlResponse
from urllib.parse import urljoin
class BookSpiderSpider(scrapy.Spider):
    name = "book_spider"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://shicimingju.com/book/hongloumeng.html"]

    def parse(self, response: HtmlResponse, **kwargs):
        # print(response.text)
        res = response.xpath("//div[@class='contbox cont11']//a")
        for item in res:
            book_url = item.xpath("./@href").extract_first()
            # print(book_url)
            # URL joining, option 1
            # new_url = urljoin(BookSpiderSpider.start_urls[0], book_url)
            # URL joining, option 2
            # new_url = urljoin(response.url, book_url)
            # print(new_url)
            # the Scrapy way to join URLs
            new_url = response.urljoin(book_url)
            # print(new_url)
            yield scrapy.Request(
                url=new_url,
                callback=self.parse_detail
            )
            # break

    # detail page
    def parse_detail(self, response: HtmlResponse, **kwargs):
        # print(response.text)
        content = response.xpath("//div[@class='text p_pad']//text()").extract()
        content = "".join(content).replace("\xa0", "").strip()
        # print(content)
        yield {
            "book_url": response.url,
            "book_content": content
        }
pipelines.py
class BookPipeline:
    def process_item(self, item, spider):
        # print the detail-page item
        print(item)
        return item
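The pipeline above only prints each item for inspection. A minimal sketch of actually persisting the chapters, built from the dict keys yielded by parse_detail (the class name and output file are hypothetical, not part of the original project):

import json

class BookSavePipeline:
    def open_spider(self, spider):
        # open the output file once when the spider starts
        self.f = open("hongloumeng.jsonl", "w", encoding="utf-8")

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.f.close()

    def process_item(self, item, spider):
        # the spider yields plain dicts, so json.dumps works directly
        self.f.write(json.dumps(item, ensure_ascii=False) + "\n")
        return item

To take effect it would also have to be registered in ITEM_PIPELINES, e.g. "book.pipelines.BookSavePipeline": 301.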
settings.py
# Scrapy settings for book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "book"
SPIDER_MODULES = ["book.spiders"]
NEWSPIDER_MODULE = "book.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "book (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-encoding": "gzip, deflate, br, zstd",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": "Hm_lvt_649f268280b553df1f778477ee743752=1731913174; HMACCOUNT=BE3BFACC7E1F214F; Hm_lpvt_649f268280b553df1f778477ee743752=1731913320",
"priority": "u=0, i",
"referer": "https://www.shicimingju.com/book",
"sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "book.middlewares.BookSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "book.middlewares.BookDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"book.pipelines.BookPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
2. Scrapy pagination
runner.py
from scrapy.cmdline import execute
if __name__ == '__main__':
execute("scrapy crawl shu".split())
shu.py
import scrapy
class ShuSpider(scrapy.Spider):
    name = "shu"
    allowed_domains = ["dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    def parse(self, response, **kwargs):
        print(response.url)  # print each page URL to watch the pagination work
        book_infos = response.xpath('//div[@class="book-info"]')
        for book_info in book_infos:
            href = book_info.xpath('.//h3/a/@href').extract_first()
            new_url = response.urljoin(href)
            # print(new_url)
            yield scrapy.Request(
                # detail-page url
                url=new_url,
                callback=self.parse_detail
            )
        # handle pagination
        a_list = response.xpath("//div[@class='pages']/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            fenye_url = response.urljoin(href)
            # just yield the request; even a duplicate URL is not a problem
            yield scrapy.Request(
                url=fenye_url,
                callback=self.parse  # for pagination, the callback is parse itself
            )

    def parse_detail(self, response):
        # book title
        name = response.xpath('//div[@class="book-title"]/h1/text()').extract_first()
        # author
        author = response.xpath('//div[@class="book-details"]//tr/td[2]/text()').extract_first()
        yield {
            'name': name,
            'author': author
        }
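The pagination loop yields a request for every link in the page switcher, including links back to pages that were already crawled. Scrapy's scheduler silently drops requests whose URL fingerprint has been seen before (the default duplicate filter), which is why re-yielding a known URL is harmless here. For the rare case where the same URL really must be fetched again, a request can opt out of that filter; a small illustration, not something this spider needs:

yield scrapy.Request(
    url=fenye_url,
    callback=self.parse,
    dont_filter=True  # bypass the duplicate filter for this one request
)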
pipelines.py
class DushuPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
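Printing in the pipeline is just for checking the output. An alternative in recent Scrapy versions is the built-in feed export, which writes the yielded dicts to a file without any pipeline code; a minimal sketch for settings.py (the output file name is an assumption):

FEEDS = {
    "dushu_books.jsonl": {
        "format": "jsonlines",   # one JSON object per line
        "encoding": "utf8",
    },
}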
settings.py
# Scrapy settings for dushu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "dushu"
SPIDER_MODULES = ["dushu.spiders"]
NEWSPIDER_MODULE = "dushu.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "dushu (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-encoding": "gzip, deflate, br, zstd",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
"cookie": "Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1732085842; HMACCOUNT=BE3BFACC7E1F214F; __gads=ID=17dcdd5451ae0620:T=1732086997:RT=1732090519:S=ALNI_MZMXqqiZj5OtUfMkZsq7u3ui3J_eQ; __gpi=UID=00000f99c374b4db:T=1732086997:RT=1732090519:S=ALNI_MYxmun9cz-LQpIQLeRvTL7vMf_M2g; __eoi=ID=dd0db9b091de93d2:T=1732086997:RT=1732090519:S=AA-AfjbLn9GNmUWnL1isA5Q6NFpH; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1732091957",
"priority": "u=0, i",
"sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "dushu.middlewares.DushuSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "dushu.middlewares.DushuDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"dushu.pipelines.DushuPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
3. Downloading images with Scrapy
runner.py
from scrapy.cmdline import execute
if __name__ == '__main__':
execute("scrapy crawl dongman".split())
dongman.py
import scrapy
import re
import json
class DongmanSpider(scrapy.Spider):
    name = "dongman"
    allowed_domains = ["zol.com.cn", "zol-img.com.cn"]
    start_urls = ["https://desk.zol.com.cn/dongman/"]
    obj = re.compile(r"var deskPicArr = (?P<code>.*?);", re.S)

    def parse(self, resp, **kwargs):
        li_list = resp.xpath('//li[@class="photo-list-padding"]')
        for li in li_list:
            href = li.xpath("./a/@href").extract_first()
            if href.endswith(".exe"):
                # skip links that point to an .exe file
                continue
            new_url = resp.urljoin(href)
            # print(new_url)
            yield scrapy.Request(
                url=new_url,
                callback=self.parse_detail
            )
            # break

    def parse_detail(self, resp, **kwargs):
        # detail-page logic:
        # get the image download address
        # src = resp.xpath('//img[@class="bigImg"]/@src').extract_first()
        # download the image
        # print(resp.text)
        title = resp.xpath("//title/text()").extract_first().split("-")[0]
        code = DongmanSpider.obj.search(resp.text).group("code")
        # print(code)
        dic = json.loads(code)
        for item in dic['list']:
            fenban = item["resAll"][0]
            img_src = item["imgsrc"]
            img_src = img_src.replace("##SIZE##", fenban + "c5")
            # print(img_src)
            yield {
                "img_src": img_src,
                "title": title,
                "referer": resp.url
            }

# "http://desk-fd.zol-img.com.cn/t_s##SIZE##/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg"
# https://desk-fd.zol-img.com.cn/t_s144x90c5/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg
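The two commented URLs above show what the replacement in parse_detail does: deskPicArr stores a template URL containing a ##SIZE## placeholder, and the spider substitutes the first resolution from resAll plus the "c5" suffix. The same step in isolation, using the values from the comments (the resolution "144x90" is inferred from the resolved URL):

template = "http://desk-fd.zol-img.com.cn/t_s##SIZE##/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg"
fenban = "144x90"  # would come from item["resAll"][0]
img_src = template.replace("##SIZE##", fenban + "c5")
# img_src now ends in /t_s144x90c5/g5/M00/0C/05/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg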
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
import scrapy
class TuPipeline:
    def process_item(self, item, spider):
        print(item)
        return item


class MytuPipeline(ImagesPipeline):
    # issue the download request for each image
    def get_media_requests(self, item, info):
        yield scrapy.Request(
            url=item["img_src"],
            headers={
                "referer": item["referer"]
            },
            meta={
                "title": item["title"],
                "img_src": item["img_src"]
            }
        )

    # decide where the image is saved
    def file_path(self, request, response=None, info=None, *, item=None):
        my_path = 'tu/'
        title = request.meta["title"]
        file_name = request.meta["img_src"].split("/")[-1]
        return my_path + title + "/" + file_name

    # runs after the download finishes; final housekeeping
    def item_completed(self, results, item, info):
        return item
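For MytuPipeline to download anything, it has to be enabled in ITEM_PIPELINES and IMAGES_STORE has to be set (both are done in settings.py below), and the Pillow library must be installed, since ImagesPipeline uses it to process the downloaded images. item_completed receives a list of (success, result) tuples, one per media request; an optional extension (not in the original project) could use it to report failed downloads:

    def item_completed(self, results, item, info):
        # results: list of (ok, result) tuples; result is a Failure when ok is False
        failed = [r for ok, r in results if not ok]
        if failed:
            print("download failed for", item["img_src"], failed)
        return item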
settings.py
# Scrapy settings for tu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "tu"
SPIDER_MODULES = ["tu.spiders"]
NEWSPIDER_MODULE = "tu.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tu (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-encoding": "gzip, deflate, br, zstd",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"cache-control": "no-cache",
"cookie": "ip_ck=3Yqq14y/v8EuOTc2MjA1LjE3MDE2NzE2NjM%3D; z_pro_city=s_provice%3Dbeijing%26s_city%3Dbeijing; userProvinceId=1; userCityId=0; userCountyId=0; userLocationId=1; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1724053531,1725013913; HMACCOUNT=0BFAD8D83E97B549; lv=1725025604; vn=12; z_day=icnmo11564%3D1%26ixgo20%3D1; Adshow=5; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1725025698; questionnaire_pv=1724976004",
"dnt": "1",
"pragma": "no-cache",
"priority": "u=0, i",
"sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "tu.middlewares.TuSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "tu.middlewares.TuDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"tu.pipelines.TuPipeline": 300,
"tu.pipelines.MytuPipeline": 250
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
MEDIA_ALLOW_REDIRECTS = True
# Image downloads require one extra setting:
# the root directory for saved images
IMAGES_STORE = "./"
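With the root directory set to "./" and the file_path method above returning "tu/" + title + "/" + file_name, each wallpaper ends up under ./tu/<page title>/<original file name>, e.g. ./tu/<page title>/ChMkJ14dLNeIfJBuAAZKuwc_TagAAwWUAJPWVAABkrT442.jpg (the title segment is whatever the detail page's <title> tag starts with, so this path is only illustrative).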