scrapy 常用代码

Spider

spider中setting配置的获取

self.settings.get('QLPQMS')   # per the author's note: not readable inside __init__; custom setting names must be upper-case

请求与响应之间通过meta传参

yield scrapy.http.Request(url,
                          callback=self.callback, 
                          meta={'page': 'aaa'},
                          priority=优先级数值
  )

通过response.meta获取传参

# Look up the 'page' entry in the response's meta mapping.
# NOTE(review): presumably None when the key was never set (dict-style .get) — confirm.
page = response.meta.get('page')

启动参数获取

class JsSpider01Spider(scrapy.Spider):
    """Example spider that accepts one extra start-up argument.

    The constructor takes the argument as its first parameter and stores it
    on the instance; everything else is forwarded to ``scrapy.Spider``.
    """

    name = 'xxxxxxx'
    start_urls = []
    allowed_domains = []

    def __init__(self, 参数, *args, **kwargs):
        """Store the start-up argument, then run the base-class initialiser."""
        # The assignment does not depend on the base initialiser, so it can
        # safely happen first.
        self.参数 = 参数
        super().__init__(*args, **kwargs)

Pipelines

pipe赋给spider 以便共享属性 --> 测试可行

from peewee import *
# Module-level peewee MySQL handle configured from the DATABASE_* values.
# NOTE(review): `settings` is not defined in this snippet; presumably it is
# the Scrapy project settings object — confirm where it is imported from.
database = MySQLDatabase(database=settings.get('DATABASE_NAME'),
                         user=settings.get('DATABASE_USER'),
                         password=settings.get('DATABASE_PASSWORD'),
                         host=settings.get('DATABASE_HOST'),
                         )
class js_SpiderPipeline:
    """Item pipeline that owns the database connection and exposes itself to the spider.

    On start-up the pipeline connects the module-level ``database`` handle
    and stores itself on the spider as ``spider.pipe`` so spider code can
    reach pipeline attributes. The connection is released again when the
    spider closes.
    """

    def open_spider(self, spider):
        """Connect to the database and attach this pipeline to ``spider``."""
        self.js_database = database
        # ``database`` is a shared module-level handle; reuse_if_open avoids
        # an error if something else has already connected it.
        self.js_database.connect(reuse_if_open=True)
        self.spider = spider
        # Share this pipeline (and its attributes) with the spider.
        self.spider.pipe = self

    def close_spider(self, spider):
        """Close the database connection opened in ``open_spider``."""
        # open_spider may have failed before js_database was assigned.
        js_database = getattr(self, 'js_database', None)
        if js_database is not None and not js_database.is_closed():
            js_database.close()

对比 数据库查询 与 item数据

爬取到的 item 数据未经处理,基本都是文本格式;而数据写入数据库后再查询出来时已带有类型(数字、日期等)。因此在判断是否需要更新时,不能直接比较,需要先按类型转换后再对比。

from datetime import datetime, date
from decimal import Decimal
def 对比数据(self, q_data, data):
    """Return the fields of ``data`` that differ from the stored row ``q_data``.

    Scraped values are mostly plain text while values read back from the
    database are typed, so numbers and dates are compared by value rather
    than by raw equality.

    Args:
        q_data: mapping of field name to the value queried from the database;
            it must contain every key present in ``data``.
        data: mapping of field name to the freshly scraped value.

    Returns:
        dict holding only the entries of ``data`` that need to be written.
        Scraped values that are ``''``, ``None`` or ``'null'`` are never
        reported as changes.

    Raises:
        KeyError: if a key of ``data`` is missing from ``q_data``.
    """
    update_dict = {}
    for k, v in data.items():
        q_v = q_data[k]
        # Identical values, and "empty" scraped values, never trigger an update.
        if q_v == v or v == '' or v is None or v == 'null':
            continue
        if isinstance(q_v, (int, float, Decimal)):
            try:
                same = float(v) == float(q_v) or Decimal(q_v) == Decimal(v)
            except (TypeError, ValueError, ArithmeticError):
                # Scraped text that is not a number (e.g. 'N/A') cannot equal
                # a numeric column. decimal.InvalidOperation is a subclass of
                # ArithmeticError, so Decimal parse failures land here too.
                same = False
            if same:
                continue
        elif isinstance(q_v, (datetime, date)) or isinstance(v, datetime):
            # Dates are compared through their string form.
            if str(q_v) == str(v):
                continue
        update_dict[k] = v
    return update_dict

Middleware

检查经过DownloadMiddleware的响应并重新发送请求

class ResposeCheckDownloadMiddleware:
    """Downloader middleware that re-schedules requests whose response looks bad.

    A response is treated as bad when it is truncated (no closing
    ``</body>`` tag), contains the string ``administrator`` (unexpected
    content returned by a free proxy), or has status 404.
    """

    # Priority given to re-scheduled truncated/404 requests (higher runs first).
    RETRY_PRIORITY = 100000000
    # Maximum number of re-schedules per request; without a bound a page that
    # is permanently broken would be requested forever.
    MAX_RECHECKS = 10

    def process_response(self, request, response, spider):
        """Return ``request`` to re-schedule it, or ``response`` to pass it on.

        Args:
            request: the request that produced ``response``.
            response: the downloaded response to inspect.
            spider: the running spider; its logger receives the messages.
        """
        # Responses that expose no text content cannot be inspected here.
        text = getattr(response, 'text', None)
        if text is None:
            return response

        if '</body>' not in text and '</BODY>' not in text:
            # Incomplete response: only part of the page arrived.
            return self._reschedule(
                request, response, spider,
                f'爬取内容不全,重新请求-->{response.url}',
                boost=True,
            )

        if 'administrator' in text:
            # A free proxy returned unexpected content.
            return self._reschedule(
                request, response, spider,
                f'响应包含 administrator 字符,返回并重新请求-->{response.url}',
                boost=False,
            )

        if response.status == 404:
            return self._reschedule(
                request, response, spider,
                f'请求404,返回并重新请求-->{response.url}',
                boost=True,
            )

        return response

    def _reschedule(self, request, response, spider, message, boost):
        """Re-queue ``request`` unless it has used up its re-check budget."""
        attempts = request.meta.get('recheck_times', 0)
        if attempts >= self.MAX_RECHECKS:
            spider.logger.warning(
                f'重新请求次数已达上限({self.MAX_RECHECKS}),放弃重试-->{response.url}'
            )
            return response
        request.meta['recheck_times'] = attempts + 1
        spider.logger.info(message)
        if boost:
            # The scheduler orders by the ``priority`` attribute; a
            # 'priority' key in ``meta`` does not affect scheduling.
            request.priority = self.RETRY_PRIORITY
        # A re-sent request must bypass the duplicate filter.
        request.dont_filter = True
        return request

中间件更换请求头

from random import choice
class UserAgentMiddleware:
    """Downloader middleware that gives each request a random User-Agent.

    Besides the User-Agent, a fixed set of ``accept*`` headers is written on
    every request.
    """

    # Pool of User-Agent strings; one is drawn at random per request.
    user_agent = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
        "UCWEB7.0.2.37/28/999",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    ]

    def process_request(self, request, spider):
        """Write a random User-Agent and the fixed accept headers on ``request``."""
        fixed_headers = (
            ('accept-language', 'zh-Hans-CN,zh-Hans;q=0.5'),
            ('accept-encoding', 'gzip, deflate'),
            ('accept', 'text/html, application/xhtml+xml, image/jxr, */*'),
        )
        request.headers['User-Agent'] = choice(self.user_agent)
        for header_name, header_value in fixed_headers:
            request.headers[header_name] = header_value

请求添加代理

使用动态隧道ip代理的时候,隧道代理要求设置request.headers['Connection'] = 'close'

class ProxyMiddleware:
    """Downloader middleware that routes every request through one proxy."""

    def process_request(self, request, spider):
        """Store the proxy address under ``request.meta['proxy']``."""
        # Placeholder address: replace with the real proxy host and port.
        request.meta["proxy"] = 'http://xxx.xxx.xxx.xxx:xxxx'

setting

settings.py 中的自定义配置项名称必须使用大写字母
常用配置有

# --- Logging ---------------------------------------------------------------
LOG_LEVEL = "INFO"

# --- Retry behaviour -------------------------------------------------------
# Retrying failed requests is on by default; the author usually turns it off.
RETRY_ENABLED = True
# Number of retries after a failure (the author notes the default is two).
RETRY_TIMES = 3
# Only responses with these HTTP status codes are retried.
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]

# --- Crawl politeness and throughput ---------------------------------------
# robots.txt handling; False means robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
# Number of concurrent requests.
CONCURRENT_REQUESTS = 1
# Delay between requests; per the author it applies whatever the concurrency,
# so requests queue up one after another.
DOWNLOAD_DELAY = 0.2
# Cookies are disabled.
COOKIES_ENABLED = False

# --- Downloader middlewares to enable --------------------------------------
DOWNLOADER_MIDDLEWARES = {
    # 'xxxxxxx.middlewares.JsSpiderDownloaderMiddleware': 543,
    'xxxxxxx.middlewares.UserAgentMiddleware': 543,
    # 'xxxxxxx.middlewares.ProxyMiddleware': 544,
    'xxxxxxx.middlewares.ResposeCheckDownloadMiddleware': 545,
}

# --- Item pipelines to enable ----------------------------------------------
ITEM_PIPELINES = {
    # 'xxxxxx.pipelines.xxxxxxxPipeline': 300,
    'xxxxxx.pipelines.xxxxxxxPipeline': 301,
}

调试脚本

from scrapy import cmdline

print('爬虫开始==========================================================>')
try:
    # cmdline.execute() ends by calling sys.exit(), so a statement placed
    # after it never runs; the closing banner is printed from ``finally``.
    cmdline.execute('scrapy crawl xxxxxxx -a 参数=value'.split())
finally:
    print('爬虫结束==========================================================>')
posted @ 2022-12-16 17:33  meizhengchao  阅读(28)  评论(0)  编辑  收藏  举报