python 3.7.5 Scrapy 架构中的代理IP和随机User-Agent 配置

基本上不需要修改原有代码,添加如下代码即可。

注:在settings.py 中需要注释掉原有的 USER_AGENT 配置。

###############
##settings.py##
###############

# Proxy IP pool: each entry maps "ipaddr" to a "host:port" string.
# Replace the placeholders with real proxy addresses before running.
IPPOOL = [
    {"ipaddr": "x.x.x.x:端口"},
    {"ipaddr": "x.x.x.x:端口"},
]

# User-Agent pool: one of these strings is picked at random for each request.
UAPOOL = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
]

# Register the two custom downloader middlewares. Replace the module-path
# prefix ("自己的爬虫名称") with your own Scrapy project name. Lower numbers
# run earlier (closer to the engine) for process_request.
DOWNLOADER_MIDDLEWARES = {
    '自己的爬虫名称.middlewares.IPPOOLS': 125,
    '自己的爬虫名称.middlewares.Uamid': 127,
}

######################
##配置 middlewares.py##
######################

import random

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from .settings import IPPOOL, UAPOOL


class IPPOOLS(HttpProxyMiddleware):
    """Downloader middleware that attaches a random proxy from IPPOOL to each request."""

    def __init__(self, ip=""):
        # Deliberately does NOT call super().__init__(), so the base class's
        # environment-variable proxy detection is skipped; proxies come only
        # from the IPPOOL setting.
        self.ip = ip

    def process_request(self, request, spider):
        """Pick a random pool entry and set it as this request's HTTP proxy."""
        thisip = random.choice(IPPOOL)
        # Scrapy reads the proxy endpoint from request.meta["proxy"].
        request.meta["proxy"] = "http://" + thisip["ipaddr"]


class Uamid(UserAgentMiddleware):
    """Downloader middleware that attaches a random User-Agent from UAPOOL to each request."""

    def __init__(self, user_agent=""):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Pick a random User-Agent string and apply it to the request headers."""
        thisua = random.choice(UAPOOL)
        # Overwrite unconditionally: the original used headers.setdefault(),
        # which silently keeps any User-Agent already set earlier in the
        # middleware chain, so the random UA would never take effect unless
        # the USER_AGENT setting was manually commented out.
        request.headers["User-Agent"] = thisua

 

posted @ 2022-04-08 11:10  语~默  阅读(98)  评论(0编辑  收藏  举报