scrapy版本爬取某网站,加入了ua池,ip池,不限速不封号,100个线程爬崩网站
scrapy版本爬取妹子图
不封号,不限速,无限爬取
关键所在:下载图片
from scrapy.pipelines.images import ImagesPipeline
把原来继承 object 的 pipeline 类改为继承 ImagesPipeline,
并重写下面三个方法(方法名是固定的,不能改):
def get_media_requests # 下载图片
def item_completed # 是否下载成功
def file_path # 图片存放
前期准备
代理ip池
import pymysql
import random
def get_ip():
    """Return one proxy address picked at random from the ``ip_list`` table.

    A new MySQL connection is opened on every call, every ``ip`` value is
    read from ``ip_list`` and one of them is chosen at random.

    Returns:
        The ``ip`` column value of a randomly chosen row.

    Raises:
        IndexError: If ``ip_list`` contains no rows (raised by
            ``random.choice`` on an empty sequence).
    """
    # NOTE(review): connection parameters are hard-coded here; consider
    # moving them to configuration.
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        password='123',
        database='pachong',
        charset='utf8',
        autocommit=True
    )
    try:
        # DictCursor returns each row as a dict keyed by column name.
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        cursor.execute('select ip from ip_list')
        ip_list = cursor.fetchall()
    finally:
        # Close the connection even when the query fails, so it never leaks.
        conn.close()
    return random.choice(ip_list)['ip']
UserAgent池
def get_UserAgent():
    """Return a randomly selected User-Agent value from ``fake_useragent``."""
    # The import stays local to the function, as in the original code.
    from fake_useragent import UserAgent as _UserAgentSource

    # ``.random`` picks one User-Agent at random.
    return _UserAgentSource(verify_ssl=False).random
middlewares中间件(破解反爬)
from mzitu.pool.ip import get_ip
from mzitu.pool.useragent import get_UserAgent
def process_request(self, request, spider):
    """Attach a random proxy, a random User-Agent and a Referer to *request*.

    Args:
        request: The outgoing request; its ``meta`` and ``headers`` are
            modified in place.
        spider: The spider that issued the request (unused).

    Returns:
        None, so that Scrapy continues handling the request normally.
    """
    proxy = get_ip()
    # Scrapy's HttpProxyMiddleware expects a full proxy URL.
    # NOTE(review): presumably the table stores bare ``host:port`` values;
    # a scheme is added only when none is present - confirm the stored format.
    if '://' not in proxy:
        proxy = 'http://' + proxy
    # The key Scrapy reads is ``proxy``; a ``proxies`` key is silently
    # ignored and the request would go out without any proxy.
    request.meta['proxy'] = proxy
    request.headers['User-Agent'] = get_UserAgent()
    request.headers['Referer'] = 'https://www.mzitu.com/'
    return None
settings配置
# Only log errors.
LOG_LEVEL = 'ERROR'
# Download directory for images; it is created automatically if missing.
# Raw string: the Windows backslashes must never be read as escape sequences.
IMAGES_STORE = r'E:\python13\pachong\images'
RETRY_ENABLED = False  # do not retry failed requests
DOWNLOAD_TIMEOUT = 10  # give up on a request after this many seconds
# Enable the download pipeline.
ITEM_PIPELINES = {
    'mzitu.pipelines.MzituPipeline': 300,
}
# Enable the downloader middleware.
DOWNLOADER_MIDDLEWARES = {
    'mzitu.middlewares.MzituDownloaderMiddleware': 543,
}
正题
爬虫
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from mzitu.items import MzituItem
class AmzituSpider(scrapy.Spider):
    """Walk a gallery page by page, emitting one item per main image.

    Each page yields a ``MzituItem`` carrying the image URL and file name,
    followed by a request for the page behind the last pagination link.
    """

    name = 'Amzitu'
    start_urls = ['https://www.mzitu.com/197251']

    def parse(self, response):
        """Extract the main image of a page and follow the pagination link.

        Args:
            response: The downloaded gallery page.

        Yields:
            A ``MzituItem`` with ``img_url`` and ``img_name`` when the page
            has a main image, then a ``Request`` for the next page when a
            pagination link is present.
        """
        soup = BeautifulSoup(response.text, 'lxml')

        # A page without the expected markup is reported and skipped
        # instead of aborting the callback with an IndexError.
        img_tag = soup.select_one('.main-image img')
        img_url = img_tag.get('src') if img_tag is not None else None
        if img_url:
            item = MzituItem()
            item['img_url'] = img_url
            # The file name is the last path segment of the image URL.
            item['img_name'] = img_url.rsplit('/', 1)[-1]
            yield item
        else:
            self.logger.error('No main image found on %s', response.url)

        # The last anchor inside the pagination bar leads to the next page.
        next_link = soup.select_one('.pagenavi a:nth-last-child(1)')
        next_url = next_link.get('href') if next_link is not None else None
        if next_url:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield Request(response.urljoin(next_url))
保存下载图片
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
class MzituPipeline(ImagesPipeline):
    """Image pipeline that downloads ``item['img_url']`` under ``item['img_name']``."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The target file name travels in ``meta['name']`` so that
        ``file_path`` can read it back from the request.
        """
        print(item['img_url'])
        # A single URL per item; a collection of URLs would need a loop here.
        yield Request(url=item['img_url'], meta={'name': item['img_name']})

    def item_completed(self, results, item, info):
        """Return *item* if its image was downloaded, otherwise drop it.

        Args:
            results: Sequence of tuples whose first element is a boolean
                telling whether that download succeeded.
            item: The item being processed.
            info: Pipeline bookkeeping object (unused).

        Raises:
            DropItem: If there is no result or the first download failed.
        """
        # An empty result list counts as a failed download rather than
        # crashing with an IndexError.
        if not results or not results[0][0]:
            raise DropItem('下载失败')
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the downloaded image.

        Overridden so the file keeps its original name instead of the
        hash-based name used by default. The keyword-only ``item`` argument
        is accepted for Scrapy versions that pass it; it is not used.
        """
        return request.meta['name']
选择了IT,必定终身学习