scrapy 常用代码
Spider
spider中setting配置的获取
self.settings.get('QLPQMS')  # 不能在__init__中获取settings配置;settings中的自定义配置名必须为大写字母
请求与响应之间通过 meta 传参
yield scrapy.http.Request(url,
callback=self.callback,
meta={'page': 'aaa'},
priority=优先级数值
)
通过 response.meta 获取传参
page = response.meta.get('page')
启动参数获取
class JsSpider01Spider(scrapy.Spider):
    """Spider template: receives a custom argument passed via `scrapy crawl -a`."""
    name = 'xxxxxxx'
    allowed_domains = []
    start_urls = []

    def __init__(self, 参数, *args, **kwargs):
        # Let scrapy.Spider consume the remaining arguments first,
        # then keep the -a value on the instance.
        super(JsSpider01Spider, self).__init__(*args, **kwargs)
        self.参数 = 参数
Pipelines
将 pipeline 实例赋给 spider,以便二者共享属性(已测试可行)
from peewee import *

# Module-level MySQL connection shared by the pipeline below.
# NOTE(review): `settings` is presumably the project settings object
# (e.g. scrapy.utils.project.get_project_settings()) — not shown here; confirm.
database = MySQLDatabase(database=settings.get('DATABASE_NAME'),
                         user=settings.get('DATABASE_USER'),
                         password=settings.get('DATABASE_PASSWORD'),
                         host=settings.get('DATABASE_HOST'),
                         )
class js_SpiderPipeline:
    """Item pipeline that opens the DB connection and cross-links itself
    with the spider so the two objects can share attributes."""

    def open_spider(self, spider):
        # Reuse the module-level peewee database and open a connection.
        self.js_database = database
        self.js_database.connect()
        # Cross-link: the spider reaches the pipeline via `spider.pipe`,
        # the pipeline reaches the spider via `self.spider`.
        spider.pipe = self
        self.spider = spider
对比 数据库查询 与 item数据
item数据爬取无处理,基本为文本格式,写入数据库后提取数据类型变化,对比写入数据库需要再行对比
from datetime import datetime, date
from decimal import Decimal
def 对比数据(self, q_data, data):
    """Diff a DB row (q_data) against freshly scraped values (data).

    Scraped values are raw text, while values read back from the database
    come out typed (int/float/Decimal/datetime/...), so each field is
    compared after normalizing to a common form.

    :param q_data: dict-like row previously stored in the database
    :param data: dict of freshly scraped values (mostly strings)
    :return: {field: new_value} for every field that genuinely changed;
             empty/placeholder scraped values never overwrite DB data.
    """
    update_dict = {}
    for k, v in data.items():
        q_v = q_data[k]
        # Identical, or scraped value is empty/placeholder -> keep DB value.
        if q_v == v or v == '' or v is None or v == 'null':
            continue
        if isinstance(q_v, (int, float, Decimal)):
            # DB value is numeric: compare numerically so '5' matches 5.
            # Bug fix: the unguarded conversions crashed with ValueError /
            # decimal.InvalidOperation when v was non-numeric text.
            try:
                if float(v) == float(q_v) or Decimal(q_v) == Decimal(v):
                    continue
            except (TypeError, ValueError, ArithmeticError):
                # v is not numeric at all -> it genuinely differs.
                pass
            update_dict[k] = v
        elif isinstance(q_v, (datetime, date)) or isinstance(v, datetime):
            # Bridge datetime-vs-'YYYY-MM-DD ...' mismatches by comparing
            # the canonical string form.
            if str(q_v) != str(v):
                update_dict[k] = v
        elif q_v != v:
            update_dict[k] = v
    return update_dict
Middleware
检查经过DownloadMiddleware的响应并重新发送请求
class ResposeCheckDownloadMiddleware:
    """Downloader middleware that inspects each response and reschedules
    the request when the page looks truncated, poisoned by a bad free
    proxy, or came back as a 404."""

    def process_response(self, request, response, spider):
        # Truncated page: no closing body tag means only half the HTML arrived.
        if r'</body>' not in response.text and r'</BODY>' not in response.text:
            spider.log(f'爬取内容不全,重新请求-->{response.url}', level=logging.INFO)
            # Bug fix: priority is a Request attribute, not a meta key —
            # request.meta['priority'] is ignored by the scheduler.
            request.priority = 100000000  # large value = scheduled first
            request.dont_filter = True  # bypass the dupe filter when re-sending
            return request
        if 'administrator' in response.text:
            # Free proxies sometimes inject their own error page.
            spider.log(f'响应包含 administrator 字符,返回并重新请求-->{response.url}', level=logging.INFO)
            request.dont_filter = True
            return request
        elif response.status == 404:
            spider.log(f'请求404,返回并重新请求-->{response.url}', level=logging.INFO)
            # Higher priority runs sooner, but can starve other requests.
            request.priority = 100000000
            request.dont_filter = True
            return request
        return response
中间件更换请求头
from random import choice
class UserAgentMiddleware:
    """Downloader middleware that rotates the User-Agent header and sets
    a fixed set of browser-like accept headers on every request."""

    # Pool of desktop and mobile User-Agent strings to rotate through.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    ]

    def process_request(self, request, spider):
        # Pick a random UA for this request, then apply it together with
        # the fixed accept headers in one pass.
        outgoing_headers = {
            'User-Agent': choice(self.user_agent),
            'accept-language': 'zh-Hans-CN,zh-Hans;q=0.5',
            'accept-encoding': 'gzip, deflate',
            'accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        }
        for header_name, header_value in outgoing_headers.items():
            request.headers[header_name] = header_value
请求添加代理
使用动态隧道ip代理的时候,隧道代理要求设置request.headers['Connection'] = 'close'
class ProxyMiddleware(object):
    """Downloader middleware that attaches an HTTP proxy (placeholder
    address) to every outgoing request via request.meta['proxy']."""

    def process_request(self, request, spider):
        # Tunnel/rotating proxy vendors may additionally require
        # request.headers['Connection'] = 'close'.
        proxy_url = f'http://xxx.xxx.xxx.xxx:xxxx'
        request.meta["proxy"] = proxy_url
setting
setting.py中自定义配置信息需要大写字母
常用配置有
# Frequently used settings.py entries (custom keys must be UPPERCASE).
LOG_LEVEL = "INFO"
RETRY_ENABLED = True  # retry-on-failure is enabled by default; often turned off
RETRY_TIMES = 3  # retries after a failed request (default is 2)
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]  # only these HTTP status codes trigger a retry
ROBOTSTXT_OBEY = False  # ignore robots.txt
CONCURRENT_REQUESTS = 1  # number of concurrent requests
DOWNLOAD_DELAY = 0.2  # delay between requests; requests queue one-by-one regardless of concurrency
COOKIES_ENABLED = False  # disable cookies
# Enable downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    # 'xxxxxxx.middlewares.JsSpiderDownloaderMiddleware': 543,
    'xxxxxxx.middlewares.UserAgentMiddleware': 543,
    # 'xxxxxxx.middlewares.ProxyMiddleware': 544,
    'xxxxxxx.middlewares.ResposeCheckDownloadMiddleware': 545,
}
# Enable item pipelines
ITEM_PIPELINES = {
    # 'xxxxxx.pipelines.xxxxxxxPipeline': 300,
    'xxxxxx.pipelines.xxxxxxxPipeline': 301,
}
调试脚本
# Debug launcher: run the spider from an IDE instead of the command line.
from scrapy import cmdline

print('爬虫开始==========================================================>')
# Equivalent to: scrapy crawl xxxxxxx -a 参数=value
cmdline.execute(f'''scrapy crawl xxxxxxx -a 参数=value'''.split())
print('爬虫结束==========================================================>')
出处: https://www.cnblogs.com/meizhengchao/p/16987760.html
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出, 原文链接 如有问题, 可邮件(meizhengchao@qq.com)咨询.