Scrapy
1、Installing Scrapy
Installing Scrapy with conda on Windows 7:
conda search scrapy
conda install scrapy=2.8.0
Add C:\Program Files\Anaconda3\envs\my_env3.8\Scripts to the PATH environment variable
so that the scrapy command can be used in cmd.
cmd needs to be restarted for the change to take effect.
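Once the command works, the project used throughout these notes can be created like this (a sketch; the Test1 project and douban spider names match what is used later):
scrapy startproject Test1
cd Test1
scrapy genspider douban movie.douban.com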
References: 性能相关及Scrapy笔记 (cnblogs, 武沛齐); Scrapy Tutorial - Scrapy 2.5.0 documentation
2、Scrapy project configuration
- To crawl a site that uses a trusted certificate (supported by default), add to settings.py:
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
-
Running the spider produces:
2023-12-12 20:40:05 [scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://space.bilibili.com/3493119047764705/video>
The spider is blocked with "Forbidden by robots.txt".
Change ROBOTSTXT_OBEY to False in settings.py so that Scrapy does not obey the robots.txt protocol; after that the crawl works normally.
ROBOTSTXT_OBEY = False
-
Scrapy returns a 403 error when crawling Douban?
Set USER_AGENT in settings.py to impersonate a browser:
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
3、A first Scrapy application
douban.py
import scrapy
from scrapy import Selector

from Test1.items import MovieItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        sel = Selector(response)
        list_items = sel.css('#content > div > div.article > ol > li')
        for item in list_items:
            movie_item = MovieItem()
            movie_item['title'] = item.css('span.title::text').extract_first()
            movie_item['rating_num'] = item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = item.css('span.inq::text').extract_first()
            yield movie_item
items.py
import scrapy


class MovieItem(scrapy.Item):
    title = scrapy.Field()
    rating_num = scrapy.Field()
    subject = scrapy.Field()
Run: scrapy crawl douban -o douban.csv
This writes the scraped data to douban.csv.
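The output feed can also be configured in settings.py instead of on the command line; a minimal sketch (the file name here is just an example):
FEEDS = {
    'douban.csv': {'format': 'csv', 'encoding': 'utf-8'},
}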
4、Crawling multiple pages
import scrapy
from scrapy import Selector, Request
from scrapy.http import HtmlResponse

from Test1.items import MovieItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response: HtmlResponse):
        sel = Selector(response)
        list_items = sel.css('#content > div > div.article > ol > li')
        for item in list_items:
            movie_item = MovieItem()
            movie_item['title'] = item.css('span.title::text').extract_first()
            movie_item['rating_num'] = item.css('span.rating_num::text').extract_first()
            movie_item['subject'] = item.css('span.inq::text').extract_first()
            yield movie_item
        # print(response.text)
        hrefs_list = sel.css('div.paginator > a::attr(href)')
        for href in hrefs_list:
            url_parms = href.extract()
            url = response.urljoin(url_parms)
            yield Request(url=url)
There is a bug here:
When crawling page 2, the paginator links still include the page-1 URL "https://movie.douban.com/top250?start=0&filter=", so page 1 gets crawled again. The first request for page 1 was https://movie.douban.com/top250, so Scrapy's duplicate filter cannot tell that the two URLs point to the same page.
Fix 1: start_urls = ["https://movie.douban.com/top250?start=0&filter="]
Fix 2 (recommended): do not parse the pagination URLs at all; build the requests directly in start_requests
...
class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    # start_urls = ["https://movie.douban.com/top250?start=0&filter="]

    def start_requests(self):
        for page in range(10):
            yield Request(url=f'https://movie.douban.com/top250?start={page * 25}&filter=')

    def parse(self, response: HtmlResponse):
        ...
6、Writing the data to Excel:
Install openpyxl: pip install openpyxl
pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import openpyxl


class Test1Pipeline:

    def __init__(self):
        self.wb = openpyxl.Workbook()  # create a workbook
        # wb.create_sheet()  # create an additional worksheet
        self.ws = self.wb.active  # the default worksheet
        self.ws.title = 'Top250'
        self.ws.append(('标题', '评分', '主题'))  # header row

    def close_spider(self, spider):
        self.wb.save('电影数据.xlsx')

    def process_item(self, item, spider):
        title = item.get('title', '')
        rating = item.get('rating_num') or ''
        subject = item.get('subject', '')
        self.ws.append((title, rating, subject))
        return item  # with return, the item is printed in the terminal and passed on
In settings.py, uncomment the item pipeline configuration. 300 is the priority: the smaller the number, the earlier the pipeline runs.
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
"Test1.pipelines.Test1Pipeline": 300,
}
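If the DbPipeline from section 7 is added later, both pipelines can be registered together, and the one with the smaller number runs first (a sketch, assuming DbPipeline lives in the same pipelines.py):
ITEM_PIPELINES = {
    "Test1.pipelines.Test1Pipeline": 300,
    "Test1.pipelines.DbPipeline": 400,
}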
7、Writing the data to MySQL
Create the database: CREATE DATABASE Test1Spider
Switch to it: USE test1spider
Dropping a table: the DROP TABLE statement removes an existing table:
-- drop the table if it exists
DROP TABLE IF EXISTS mytable;
-- drop the table directly, without checking whether it exists
DROP TABLE mytable;
Create the table (note: each column definition is indented with two spaces, not a tab):
CREATE TABLE IF NOT EXISTS `tb_top_movie`(
  `mov_id` INT UNSIGNED AUTO_INCREMENT comment '编号',
  `title` varchar(50) NOT NULL comment '标题',
  `rating` decimal(3,1) NOT NULL comment '评分',
  `subject` varchar(200) default '' comment '主题',
  PRIMARY KEY (`mov_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 comment='Top电影表';
How to fix ERROR 1064 (42000): You have an error in your SQL syntax
pipelines.py
import pymysql


class DbPipeline:

    def __init__(self):
        self.conn = pymysql.connect(host='localhost',
                                    port=3306,
                                    user='root',
                                    password='123456',
                                    database='test1spider',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        title = item.get('title', '')
        rating = item.get('rating_num') or 0
        subject = item.get('subject', '')
        self.cursor.execute(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            (title, rating, subject)
        )
        return item
Batch writes:
import pymysql


class DbPipeline:

    def __init__(self):
        self.conn = pymysql.connect(host='localhost',
                                    port=3306,
                                    user='root',
                                    password='123456',
                                    database='test1spider',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.data = []

    def close_spider(self, spider):
        if len(self.data) > 0:
            self._write_to_db()
        self.conn.close()

    def process_item(self, item, spider):
        title = item.get('title', '')
        rating = item.get('rating_num') or 0
        subject = item.get('subject', '')
        self.data.append((title, rating, subject))
        if len(self.data) == 100:
            self._write_to_db()
        return item  # returning the item lets the next pipeline continue processing it

    def _write_to_db(self):
        self.cursor.executemany(
            'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
            self.data
        )
        self.conn.commit()
        self.data.clear()
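A defensive variant of _write_to_db (my own sketch, not from the original notes): if the bulk insert fails, roll back the batch so the spider keeps running; it drops into the DbPipeline above unchanged.
    def _write_to_db(self):
        try:
            self.cursor.executemany(
                'insert into tb_top_movie (title, rating, subject) values (%s, %s, %s)',
                self.data
            )
            self.conn.commit()
        except pymysql.MySQLError:
            self.conn.rollback()  # discard the failed batch instead of crashing the spider
        finally:
            self.data.clear()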
8、Miscellaneous
class TestSpider(scrapy.Spider):
    ...
    def parse(self, response: HtmlResponse):  # PyCharm warns here: this overrides the parent method but the signature does not match the parent's
        ...
Fix:
class TestSpider(scrapy.Spider):
    ...
    def parse(self, response: HtmlResponse, **kwargs):  # the parent method takes a **kwargs parameter
        ...
- Proxy IP:
If requests from a certain IP no longer get through, use a proxy IP by setting it on the Request:
def start_requests(self):
    for page in range(10):
        yield Request(
            url=f'https://movie.douban.com/top250?start={page * 25}&filter=',
            meta={'proxy': 'socks5://127.0.0.1:1086'}
        )
Here socks5 is the protocol spoken by the socket proxy, and 127.0.0.1:1086 means a proxy is listening on local port 1086. Commercial proxies usually use http or https.
Another approach is a downloader middleware:
in the process_request method of Test1DownloaderMiddleware, set the proxy on the request:
request.meta['proxy'] = '....'
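A minimal sketch of that middleware variant (the address simply reuses the local socks5 example above; a real setup would plug in its own proxy):
class Test1DownloaderMiddleware:
    def process_request(self, request, spider):
        # set the proxy per request rather than replacing the whole meta dict
        request.meta['proxy'] = 'socks5://127.0.0.1:1086'
        return None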
- Adding cookies to requests
Sometimes requests from a certain IP fail; besides using a proxy, attaching the cookies of a logged-in session to the request may solve the problem.
def start_requests(self):
    for page in range(10):
        yield Request(
            url=f'https://movie.douban.com/top250?start={page * 25}&filter=',
            cookies= ...
        )
Method 2: add the cookies in a middleware
def get_cookies_dict():
    cookies_str = 'douban-fav-remind=1; bid=8NgbfwzgpIA; __gads=ID=ebf0a13aeca46d11-22159676bce10041:T=1687258749:RT=1687258749:S=ALNI_MZCPv23l6cII6HAOrbfIoJvMS1pbA; __utmz=30149280.1698919746.13.9.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.8cb4=fb04774daf045c88.1698919784.; __utmv=30149280.7078; ll="108306"; _pk_ref.100001.8cb4=["","",1706082075,"https://cn.bing.com/"]; _pk_ses.100001.8cb4=1; __utma=30149280.2143285512.1648656872.1706008716.1706082077.16; __utmc=30149280; __utmt=1; dbcl2="70788201:zycW4xRjuCs"; ck=vJln; ap_v=0,6.0; __yadk_uid=hQ2nhqzz4xernrj5qibOs9WpLVLGqUdp; push_noty_num=0; push_doumail_num=0; __utmb=30149280.3.10.1706082077'
    cookies_dict = {}
    for item in cookies_str.split(';'):
        key, value = item.strip().split('=', maxsplit=1)  # strip() removes the space left after each ';'
        cookies_dict[key] = value
    return cookies_dict


COOKIES_DICT = get_cookies_dict()
...


class Test1DownloaderMiddleware:
    ...
    def process_request(self, request, spider):
        request.cookies = COOKIES_DICT
        return None
    ...
The downloader middleware section in settings.py must be uncommented:
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
"Test1.middlewares.Test1DownloaderMiddleware": 543,
}
9、Crawling detail pages
Modify the MySQL table:
CREATE TABLE IF NOT EXISTS `tb_top_movie`(
  `mov_id` INT UNSIGNED AUTO_INCREMENT comment '编号',
  `title` varchar(50) NOT NULL comment '标题',
  `rating` decimal(3,1) NOT NULL comment '评分',
  `subject` varchar(200) default '' comment '主题',
  `duration` int not null comment '时长',
  `intro` varchar(500) default '' comment '简介',
  PRIMARY KEY (`mov_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
In items.py, add the corresponding fields to MovieItem; in pipelines.py, adjust DbPipeline and ExcelPipeline for the new fields.
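For example, items.py then looks like this (a sketch matching the duration and intro fields used below and the new table columns):
import scrapy


class MovieItem(scrapy.Item):
    title = scrapy.Field()
    rating_num = scrapy.Field()
    subject = scrapy.Field()
    duration = scrapy.Field()
    intro = scrapy.Field()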
douban.py (the parse method of DoubanSpider now yields a Request for each detail page, passing the item along via cb_kwargs; parse_detail fills in the extra fields):
def parse(self, response: HtmlResponse, **kwargs):
    sel = Selector(response)
    list_items = sel.css('#content > div > div.article > ol > li')
    for item in list_items:
        href = item.css('div > div.info > div.hd > a::attr(href)').extract_first()
        movie_item = MovieItem()
        movie_item['title'] = item.css('span.title::text').extract_first()
        movie_item['rating_num'] = item.css('span.rating_num::text').extract_first()
        movie_item['subject'] = item.css('span.inq::text').extract_first()
        yield Request(url=href,
                      callback=self.parse_detail,
                      cb_kwargs={'item': movie_item})

def parse_detail(self, response: HtmlResponse, **kwargs):
    sel = Selector(response)
    duration = sel.css('span[property="v:runtime"]::attr(content)').extract_first()
    intro = sel.css('span[property="v:summary"]::text').extract_first()
    movie_item = kwargs['item']
    movie_item['duration'] = duration
    movie_item['intro'] = intro
    yield movie_item
Practice
When the pages being crawled contain dynamically rendered content, Selenium is needed.
- Selenium setup
from selenium import webdriver


# code from the video tutorial
def create_chrome_driver(*, headless=False):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    # anti-detection: the "browser is being controlled by automated test software" banner no longer appears
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # the driver path does not need to be specified as long as the driver executable sits in a folder on PATH, e.g. venv/bin/...exe
    browser = webdriver.Chrome(options=options)
    # anti-detection: navigator.webdriver is true when Selenium is in use; this JS redefines it to return undefined on every new document
    browser.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}
    )
    return browser
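A quick usage check (my own sketch): open a page with the stealth options applied and confirm it loads.
browser = create_chrome_driver(headless=True)
browser.get('https://movie.douban.com/top250')
print(browser.title)
browser.quit()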
Python: using Selenium with Edge
How to keep Selenium from being detected
Removing Selenium's startup fingerprints to avoid anti-bot blocking
Since this machine runs the Edge browser:
# assuming the msedge-selenium-tools package, which provides Edge/EdgeOptions with use_chromium
from msedge.selenium_tools import Edge, EdgeOptions


def create_edge_driver(*, headless=False):
    driverfile_path = r"C:\Program Files\Anaconda3\envs\my_env3.8\selenium_EdgeDriver_win64\edgedriver_win64\msedgedriver.exe"
    options = EdgeOptions()
    # with use_chromium set to True, EdgeOptions exposes the same Chromium-specific properties and methods used when automating other Chromium browsers
    options.use_chromium = True
    if headless:
        options.add_argument('headless')  # headless mode, optional
        options.add_argument('disable-gpu')
    # evade detection
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # # hide the webdriver flag: window.navigator.webdriver = false
    # options.add_argument('--disable-blink-features=AutomationControlled')
    # in Python, the Edge object creates and manages EdgeService; extra arguments for EdgeService are passed to the Edge object
    service_args = ['--verbose']
    browser = Edge(service_args=service_args, executable_path=driverfile_path, options=options)
    browser.execute_cdp_cmd(  # window.navigator.webdriver = undefined
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}
    )
    return browser
-
Simulated login
After completing the login, save the cookies to a JSON file; when the spider makes network requests, read the cookies back and attach them to the request. (Already applied in the mainSpider project)
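The add_cookies helper used in the middleware below is not shown in these notes; a minimal sketch, assuming the JSON file simply holds the list returned by browser.get_cookies() after a manual login:
import json


def save_cookies(browser, path):
    # dump the logged-in session's cookies to a JSON file
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(browser.get_cookies(), f)


def add_cookies(browser, path):
    # the browser must already be on the target domain (the middleware below visits
    # https://www.taobao.com first) before cookies can be added
    with open(path, 'r', encoding='utf-8') as f:
        for cookie in json.load(f):
            cookie.pop('expiry', None)  # stale expiry values can make add_cookie fail
            browser.add_cookie(cookie)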
-
Crawling dynamically rendered pages with Scrapy
Key point: use Selenium inside a downloader middleware to download the page content.
middlewares.py:
from scrapy.http import HtmlResponse

from TaoSpider.utils import create_edge_driver, add_cookies
...


class TaospiderDownloaderMiddleware:
    ...
    def __init__(self):
        self.browser = create_edge_driver()
        self.browser.get('https://www.taobao.com')
        add_cookies(self.browser, 'taobao2.json')

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        self.browser.get(request.url)
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8')
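The middleware still has to be registered in settings.py, just like in section 8 (a sketch; the priority value is the template default):
DOWNLOADER_MIDDLEWARES = {
    "TaoSpider.middlewares.TaospiderDownloaderMiddleware": 543,
}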
pipelines.py:
from scrapy.crawler import Crawler
import pymysql


class TaospiderPipeline:

    # Scrapy calls this classmethod factory to build the pipeline instance
    @classmethod
    def from_crawler(cls, crawler: Crawler):
        # the crawler object gives access to the settings.py configuration
        host = crawler.settings['DB_HOST']
        port = crawler.settings['DB_PORT']
        username = crawler.settings['DB_USER']
        password = crawler.settings['DB_PASS']
        database = crawler.settings['DB_NAME']
        return cls(host, port, username, password, database)

    def __init__(self, host, port, username, password, database):
        self.conn = pymysql.connect(host=host,
                                    port=port,
                                    user=username,
                                    password=password,
                                    database=database,
                                    charset='utf8mb4',
                                    autocommit=True)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        title = item.get('title', '')
        price = item.get('price', '')
        deal_count = item.get('deal_count', '')
        addr = item.get('addr', '')
        self.cursor.execute(
            'insert into taobao_test (title, price, deal_count, addr) values (%s, %s, %s, %s)',
            (title, price, deal_count, addr)
        )
        return item
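from_crawler reads the connection parameters from settings.py, so the corresponding keys have to be defined there (a sketch; the values are placeholders reusing the local defaults from section 7):
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'root'
DB_PASS = '123456'
DB_NAME = 'test1spider'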