Scrapy突破反爬虫的限制
随机切换UserAgent
https://github.com/hellysmile/fake-useragent
scrapy使用fake-useragent
在全局配置文件中禁用Scrapy默认的UserAgent中间件,将其对应的值设置为None即可
settings.py
DOWNLOADER_MIDDLEWARES = {
    # ... keep the project's other downloader middleware entries here ...
    # Mapping a middleware path to None disables it. The path must be the
    # current one (scrapy.downloadermiddlewares.*); the old scrapy.contrib.*
    # location no longer matches the built-in middleware.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
在中间件中编写自己的middleware
middlewares.py
class RandomUserAgentMiddleware(object):
    """Downloader middleware that supplies a User-Agent header per request.

    The header value is read from a fake-useragent ``UserAgent`` object. Which
    attribute is read is controlled by the ``RANDOM_UA_TYPE`` setting and
    defaults to ``'random'``.
    """

    def __init__(self, crawler):
        """Create the UserAgent source and read the configured attribute name.

        Args:
            crawler: The running crawler; only ``crawler.settings`` is used.
        """
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from a crawler (Scrapy's factory hook)."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set the request's User-Agent header if it is not already present.

        The attribute named by ``self.ua_type`` is looked up on ``self.ua``
        for every request, so the value can differ between requests.
        Returns None implicitly, which lets processing continue.
        """
        user_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', user_agent)
将自己写的middleware配置进settings中
DOWNLOADER_MIDDLEWARES = {
    # Register the random User-Agent middleware (replace "myproject" with the
    # actual project package name).
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    # Disable the built-in User-Agent middleware. The current module path is
    # required; the old scrapy.contrib.* path no longer matches it.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
随机切换IP
https://github.com/scrapy-plugins/scrapy-crawlera
爬取西刺IP代理网站获取IP
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.html import remove_tags
from ArticleSpider.items import ProxyIpItemLoader, ProxyIpItem
class ProxyIpSpider(CrawlSpider):
    """Crawl the xicidaili proxy listing and yield one item per table row."""

    name = 'proxy'
    allowed_domains = ['www.xicidaili.com']
    start_urls = ['http://www.xicidaili.com']

    rules = (
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        Rule(LinkExtractor(allow=r'nn/\d*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Extract proxy records from the ``#ip_list`` table of a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            A loaded ``ProxyIpItem`` for every table row that has all cells.
        """
        rows = response.css('#ip_list tr')
        # The first row is skipped (presumably the table header).
        for row in rows[1:]:
            cells = row.css('td')
            # Cells 1..9 are read below; skip short rows so one bad row
            # cannot raise IndexError and abort the rest of the page.
            if len(cells) < 10:
                continue
            loader = ProxyIpItemLoader(item=ProxyIpItem(), response=response)
            loader.add_value('ip', cells[1].css('td::text').extract_first())
            loader.add_value('port', cells[2].css('td::text').extract_first())
            loader.add_value('addr', self.get_addr(cells[3]))
            loader.add_value('ishidden', cells[4].css('td::text').extract_first())
            loader.add_value('type', cells[5].css('td::text').extract_first())
            # Speed and connection time are read from the title attribute
            # of a nested <div>, not from the cell text.
            loader.add_value('speed', cells[6].css('div::attr(title)').extract_first())
            loader.add_value('conn_time', cells[7].css('div::attr(title)').extract_first())
            loader.add_value('live_time', cells[8].css('td::text').extract_first())
            loader.add_value('check_time', cells[9].css('td::text').extract_first())
            yield loader.load_item()

    def get_addr(self, value):
        """Return the address text of a cell, or "未知" when it has no link.

        Args:
            value: Selector for the address cell.

        Returns:
            The cell's markup with tags removed and whitespace stripped when
            the cell contains an ``<a>`` element; otherwise the literal "未知".
        """
        if value.css('a'):
            return remove_tags(value.extract()).strip()
        return "未知"
对数据进行简单处理
class ProxyIpItemLoader(ItemLoader):
    """Item loader for proxy records; fields default to ``TakeFirst()`` output."""

    default_output_processor = TakeFirst()
def live_time(value):
    """Convert a lifetime string in minutes, hours or days to minutes.

    The unit is detected by its Chinese suffix: '分钟' (minutes), '小时'
    (hours) or '天' (days), checked in that order. The text before the unit
    is parsed as an integer.

    Returns:
        The lifetime in minutes as an int, or None when no unit is found.
    """
    minutes_per_unit = (
        ('分钟', 1),
        ('小时', 60),
        ('天', 60 * 24),
    )
    for unit, factor in minutes_per_unit:
        if unit in value:
            amount = value.split(unit)[0]
            return int(amount) * factor
    return None
def ishidden_to_int(value):
    """Return 1 when *value* contains '高匿', otherwise 0."""
    return 1 if '高匿' in value else 0
def check_time(value):
    """Parse a 'yy-mm-dd HH:MM' timestamp string into a datetime object."""
    timestamp_format = "%y-%m-%d %H:%M"
    parsed = datetime.datetime.strptime(value, timestamp_format)
    return parsed
class ProxyIpItem(scrapy.Item):
    """One proxy record scraped from the listing table.

    Example of the raw values fed into the loader, before the input
    processors run:

        {'addr': '陕西西安',
         'check_time': '12-12-31 18:52',
         'conn_time': '0.82秒',
         'ip': '113.133.160.203',
         'ishidden': '高匿',
         'live_time': '1分钟',
         'port': '6675',
         'speed': '3.595秒',
         'type': 'socks4/5'
        }
    """

    ip = scrapy.Field()
    port = scrapy.Field()
    # Strip any markup and surrounding whitespace from the address.
    addr = scrapy.Field(
        input_processor=MapCompose(remove_tags, lambda x: x.strip())
    )
    # Stored as 1/0 via ishidden_to_int.
    ishidden = scrapy.Field(
        input_processor=MapCompose(ishidden_to_int)
    )
    type = scrapy.Field()
    speed = scrapy.Field()
    conn_time = scrapy.Field()
    # Normalized to minutes via the module-level live_time helper.
    live_time = scrapy.Field(
        input_processor=MapCompose(live_time)
    )
    # Parsed to a datetime via the module-level check_time helper, which was
    # defined alongside the other processors but never attached to a field.
    check_time = scrapy.Field(
        input_processor=MapCompose(check_time)
    )

    def get_insert_sql(self):
        """Return the parameterized INSERT statement and its values.

        Returns:
            A ``(sql, params)`` tuple; ``params`` holds the nine field values
            in the same order as the column list in the statement.
        """
        insert_sql = """
insert into proxy_ip(ip, port, addr, ishidden, type, speed, conn_time, live_time, check_time)
VALUES (%s, %s, %s, %s,%s,%s, %s, %s, %s)
"""
        params = (self["ip"], self["port"], self["addr"],
                  self["ishidden"], self["type"], self["speed"],
                  self["conn_time"], self["live_time"], self["check_time"])
        return insert_sql, params
在pipeline中对数据进行再次清洗,抛弃所有使用特殊端口的item,并将数据保存
在中间件中创建切换IP的中间件,在主配置文件中启用这个中间件
验证IP是否可用,只需要通过该代理IP请求百度即可
验证码识别
没必要自己写一个验证码识别代码
可以使用云打码平台进行验证码识别
http://www.yundama.com/
需要分别注册一个普通用户和一个开发者账号
下载Python HTTP版本的SDK
http://www.yundama.com/apidoc/YDM_SDK.html#DLL
解压后里面有一个3.x的文件,打开后进行配置
# Username (regular user account)
username = 'username'
# Password (regular user account)
password = 'password'
# Software ID; required for developer revenue sharing. Obtain it from "My Software" in the developer console after logging in.
appid = 1
# Software key; required for developer revenue sharing. Obtain it from "My Software" in the developer console after logging in.
# NOTE(review): looks like a sample key; avoid committing a real key to source control.
appkey = '22cc5376925e9387a23cf797cb9ba745'
# Image file
filename = 'getimage.jpg'
# Captcha type, e.g. 1004 means 4 alphanumeric characters; pricing differs per type. Fill it in accurately or the recognition rate suffers. All types are listed at http://www.yundama.com/price.html
codetype = 1004
# Timeout, in seconds
timeout = 60