Login and pagination
import scrapy


class BhcSpider(scrapy.Spider):
    name = 'bhc'
    # allowed_domains = ['www.bhc.com']
    start_urls = ['https://www.bhc.com/list']
    login_url = 'https://www.bhc.com/auth/token'

    def start_requests(self):  # Scrapy calls start_requests() first
        # Log in before anything else; FormRequest POSTs the credentials
        # as a form-encoded body.
        form_data = {
            'username': 'bhc_username',
            'password': 'bhcbhc'
        }
        yield scrapy.FormRequest(
            self.login_url,
            formdata=form_data,
        )

    def parse(self, response, **kwargs):  # the login response lands in the default callback, parse()
        # The endpoint returns JSON; turn the token into an Authorization header.
        token = 'Bearer ' + response.json()['accessToken']
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
            'Authorization': token,
            'Content-Type': 'application/json;charset=UTF-8',
            'Referer': 'https://www.bhc.com/',
        }
        # Page through the API with the authenticated headers.
        for page in range(1, 1000):
            yield scrapy.Request(f'https://www.bhc.com/api/kols?skip={page}&take=50',
                                 headers=headers, callback=self.parse_content)

    def parse_content(self, response):
        resu = response.json()['items']
        print(resu)
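The print is only a placeholder. To hand results to the item pipelines discussed below, yield them instead. A minimal sketch, assuming each API record should be wrapped under a 'kol' key (the shape the pipelines in the next section read); this is an illustration, not the original project's item class:

    def parse_content(self, response):
        for entry in response.json()['items']:
            # Wrap each record so downstream pipelines can read item['kol'].
            yield {'kol': entry}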
Running multiple spiders in one Scrapy project at the same time
Scrapy's CrawlerProcess can run several spiders in a single process. Create a second spider file, wby.py, under the spiders directory; a minimal sketch follows.
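A bare-bones wby.py (the start URL and parse body are placeholders, not from the original project):

# spiders/wby.py
import scrapy


class WbySpider(scrapy.Spider):
    name = 'wby'
    start_urls = ['https://www.bhc.com/other-list']  # hypothetical URL

    def parse(self, response, **kwargs):
        print(response.url)  # placeholder parse logic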
main.py — start multiple spiders at once
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

project_settings = get_project_settings()
crawler = CrawlerProcess(project_settings)
crawler.crawl('sky')  # with project settings loaded, spiders can be referenced by their `name`
crawler.crawl('wby')
crawler.start()  # blocks until every scheduled spider has finished
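Run it from the project root with python main.py. crawl() also accepts a spider class instead of a name (e.g. crawler.crawl(WbySpider)) if you prefer explicit imports.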
pipelines.py — assigning each spider its own pipeline
One option is to check inside each pipeline which spider produced the item:
# pipelines.py
class SkyUpdatePipeline:
    def process_item(self, item, spider):
        if spider.name != 'sky':
            return item  # not our spider; pass the item along unchanged
        print('sky pipeline')
        kol = item['kol']
        name = kol['accountName']
        return item


class WeiboyiCrawlPipeline:
    def process_item(self, item, spider):
        if spider.name != 'wby':
            return item  # not our spider; pass the item along unchanged
        print('wby pipeline')
        kol = item['kol']
        name = kol['accountName']
        return item
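Note that process_item must return the item even for spiders the pipeline ignores; a bare return hands None to the next pipeline in the chain. To discard an item on purpose, raise DropItem instead. A small sketch (ValidateKolPipeline is a hypothetical example, not part of the project):

from scrapy.exceptions import DropItem


class ValidateKolPipeline:
    def process_item(self, item, spider):
        # Drop items that are missing the payload downstream code expects.
        if not item.get('kol'):
            raise DropItem('missing kol payload')
        return item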
Then register both pipelines in settings:
# settings.py
ITEM_PIPELINES = {
    'sky_update.pipelines.SkyUpdatePipeline': 301,    # project_name.pipelines.ClassName
    'sky_update.pipelines.WeiboyiCrawlPipeline': 300  # lower numbers run earlier
}
Setting it in the spider file itself (recommended)
# spider file 1
class WbySpider(scrapy.Spider):
    name = 'wby'
    custom_settings = {
        'ITEM_PIPELINES': {'sky_update.pipelines.WeiboyiCrawlPipeline': 300}
    }


# spider file 2
class SkySpider(scrapy.Spider):
    name = 'sky'
    custom_settings = {
        'ITEM_PIPELINES': {'sky_update.pipelines.SkyUpdatePipeline': 300}
    }
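Because custom_settings overrides the project-wide ITEM_PIPELINES for that spider only, each pipeline now receives items solely from its own spider. This works the same whether the spiders are started with scrapy crawl or through the CrawlerProcess script above.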
pipelines.py can then drop the spider.name check:
class SkyUpdatePipeline:
    def process_item(self, item, spider):
        # only items from the 'sky' spider arrive here
        return item


class WeiboyiCrawlPipeline:
    def process_item(self, item, spider):
        # only items from the 'wby' spider arrive here
        return item