Flask开发系列之Flask+redis实现IP代理池
Flask开发系列之Flask+redis实现IP代理池
代理池的要求
-
多站抓取,异步检测:多站抓取指的是我们需要从各大免费的IP代理网站,把它们公开的一些免费代理抓取下来;异步检测指的是把这些代理通过异步请求的方式去请求网站:如果能正常请求就证明代理可用,如果不能正常请求就证明代理不可用,这时就可以把这个代理剔除掉。异步指的是:我们不需要一直等待代理请求网站,等得到response之后再执行相应的操作就可以了,异步可以提高检测效率。
-
定时筛选,持续更新:我们维护一个代理池,需要定时从里面拿出一部分代理来检测,剔除掉不可用的代理,这样可以保证池中的代理是可用的。
-
提供接口,易于提取:代理实际上是维护在一个队列中,队列可以使用数据库存储,也可以使用一些数据结构来存储,但是如果要获取代理的话,要提供一个简单的接口,最简单的是web形式的接口。本文主要演示利用Python的Flask包来提供接口,之后使用Python请求该网址,就可以从返回的页面中拿到代理的信息了。
代理池的架构
-
获取器:从各大网站平台抓取代理:ip和端口
-
过滤器:剔除掉不可用的代理
-
将可用代理放到代理队列
-
定时检测器:剔除不可用的代理
-
API:通过接口形式拿到代理对象,方便使用
测试实现版
import ast
import re
import time

import redis
import requests
from bloom_filter import BloomFilter

pool = redis.ConnectionPool(host='localhost', password='xxx', port=6379, decode_responses=True)
r = redis.Redis(connection_pool=pool)

# Membership filter used by stone() to skip proxies that were already stored.
# NOTE(review): a Bloom filter is probabilistic; with error_rate=0.1 a new
# proxy may occasionally be reported as already present and skipped.
bloombloom = BloomFilter(max_elements=10000, error_rate=0.1)
bloombloom.add(str({'http': '117.91.232.53:9999'}))

# Run-wide statistics. Defined at module level so that valVer() and
# time_valVer() also work when this module is imported instead of executed.
badNum = 0
goodNum = 0
good_list = []

# Captures (ip, port) from two table cells; raw string so that ``\d`` and
# ``\.`` reach the regex engine without invalid-escape warnings.
_PROXY_ROW = re.compile(
    r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
    re.S,
)


def _proxy_works(proxies):
    """Return True when http://www.baidu.com answers 200 through ``proxies``.

    Any exception raised by the request is printed and reported as False:
    this is a best-effort probe, and a proxy that cannot be reached is
    simply an unusable proxy.
    """
    try:
        response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
    except Exception as e:
        print(e)
        return False
    return response.status_code == 200


def get_ip(i):
    """Scrape page ``i + 1`` of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; the requested URL ends with ``i + 1``.

    Returns:
        A list of ``"ip:port"`` strings found on the page.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    html = requests.get(url=url).text
    # One-second pause after every page fetch (presumably to throttle
    # requests to the listing site).
    time.sleep(1)
    ip_list = [ip + ':' + port for ip, port in _PROXY_ROW.findall(html)]
    print('共收集到%d个代理ip' % len(ip_list))
    print(ip_list)
    return ip_list


def valVer(proxys):
    """Probe freshly scraped proxies and return the ones that work.

    Updates the module-level ``goodNum`` / ``badNum`` counters and appends
    every working proxy to ``good_list``.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of ``{protocol: "ip:port"}`` dicts for the working proxies.
    """
    global badNum, goodNum
    good = []
    for proxy_host in proxys:
        protocol = 'https' if 'https' in proxy_host else 'http'
        proxies = {protocol: proxy_host}
        print('现在正在测试的IP:', proxies)
        if _proxy_works(proxies):
            goodNum += 1
            good.append(proxies)
            good_list.append(proxies)
            print(proxy_host, 'success proxy')
        else:
            badNum += 1
            print(proxy_host, 'bad proxy')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("这次:", good)
    print("此时全部:", good_list)
    return good


def time_valVer(proxys):
    """Re-check proxies already stored in Redis and evict the dead ones.

    Args:
        proxys: Iterable of the raw strings stored in the ``ip_list`` Redis
            list (each one is the ``str()`` of a proxies dict).
    """
    for raw in proxys:
        print('现在正在定时测试的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            # Entry is not a valid literal; report it and move on.
            print(e)
            continue
        if _proxy_works(proxy):
            good_list.append(proxy)
            print(proxy, 'success proxy')
        else:
            # Remove by the exact stored string. redis-py >= 3.0 takes
            # (name, count, value); passing the parsed dict in the count
            # position raised and left dead proxies in the list.
            r.lrem("ip_list", 1, raw)
            print(proxy, 'bad proxy')


def stone(good):
    """Append proxies that are not yet known to the ``ip_list`` Redis list.

    Args:
        good: Iterable of proxies dicts as returned by valVer().
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            print("%s不能存储,有相同的IP" % (IP,))
            continue
        print("存储的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)


if __name__ == '__main__':
    for page in range(0, 10):
        # NOTE(review): with ten iterations and a period of ten this branch
        # never fires, so stored proxies are not re-checked in this version;
        # lower the period to enable periodic re-validation.
        if page % 10 == 0 and page != 0:
            time_valVer(r.lrange("ip_list", 0, -1))
        else:
            stone(valVer(get_ip(page)))
from flask import Flask, abort
import redis  # Drive Redis from Python; the cache database can also be operated directly on the Redis server host.

r = redis.Redis(host='localhost', port=6379, password='xxx', decode_responses=True)
app = Flask(__name__)


@app.route('/ip/<int:index>')
def reponse(index):
    """Return the proxy stored at position ``index`` of the ``ip_list`` list.

    Responds with 404 when the list has no element at that position; a view
    that returns ``None`` is rejected by Flask with a server error.
    """
    print(index)
    # Read once so the logged value and the returned value cannot differ.
    proxy = r.lindex("ip_list", index)
    print(proxy)
    if proxy is None:
        abort(404)
    return proxy


if __name__ == '__main__':
    # debug=True turns on Flask's debugger and reloader; local development only.
    app.run(debug=True)
获取ip:
改进版
import ast
import re
import time

import redis
import requests
from bloom_filter import BloomFilter

pool = redis.ConnectionPool(host='localhost', password='XXX', port=6379, decode_responses=True)
r = redis.Redis(connection_pool=pool)

# Membership filter used by stone_redis() to skip already stored proxies;
# init() seeds it from Redis at start-up.
# NOTE(review): a Bloom filter is probabilistic; with error_rate=0.1 a new
# proxy may occasionally be reported as already present and skipped.
bloombloom = BloomFilter(max_elements=10000, error_rate=0.1)

# Run-wide statistics. Defined at module level so that valVer() also works
# when this module is imported instead of executed.
badNum = 0
goodNum = 0
good_list = []

# Captures (ip, port) from two table cells; raw string so that ``\d`` and
# ``\.`` reach the regex engine without invalid-escape warnings.
_PROXY_ROW = re.compile(
    r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
    re.S,
)


def _proxy_works(proxies):
    """Return True when http://www.baidu.com answers 200 through ``proxies``.

    Any exception raised by the request is printed and reported as False:
    this is a best-effort probe, and a proxy that cannot be reached is
    simply an unusable proxy.
    """
    try:
        response = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
    except Exception as e:
        print(e)
        return False
    return response.status_code == 200


def get_ip(i):
    """Scrape page ``i + 1`` of the kuaidaili free-proxy listing.

    Args:
        i: Zero-based page index; the requested URL ends with ``i + 1``.

    Returns:
        A list of ``"ip:port"`` strings found on the page.
    """
    url = 'https://www.kuaidaili.com/free/inha/' + str(i + 1)
    html = requests.get(url=url).text
    # One-second pause after every page fetch (presumably to throttle
    # requests to the listing site).
    time.sleep(1)
    ip_list = [ip + ':' + port for ip, port in _PROXY_ROW.findall(html)]
    print('共收集到%d个代理ip' % len(ip_list))
    print(ip_list)
    return ip_list


def valVer(proxys):
    """Probe freshly scraped proxies and return the ones that work.

    Updates the module-level ``goodNum`` / ``badNum`` counters and appends
    every working proxy to ``good_list``.

    Args:
        proxys: Iterable of ``"ip:port"`` strings.

    Returns:
        A list of ``{protocol: "ip:port"}`` dicts for the working proxies.
    """
    global badNum, goodNum
    good = []
    for proxy_host in proxys:
        protocol = 'https' if 'https' in proxy_host else 'http'
        proxies = {protocol: proxy_host}
        if _proxy_works(proxies):
            goodNum += 1
            good.append(proxies)
            good_list.append(proxies)
        else:
            badNum += 1
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print("这次:", good)
    print("此时全部:", good_list)
    return good


def time_valVer(proxys):
    """Re-check proxies already stored in Redis and evict the dead ones.

    Args:
        proxys: Iterable of the raw strings stored in the ``ip_list`` Redis
            list (each one is the ``str()`` of a proxies dict).
    """
    for raw in proxys:
        print('现在正在定时测试的IP:', raw)
        try:
            proxy = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as e:
            # Entry is not a valid literal; report it and move on.
            print(e)
            continue
        if not _proxy_works(proxy):
            # Remove by the exact stored string. redis-py >= 3.0 takes
            # (name, count, value); passing the parsed dict in the count
            # position raised and left dead proxies in the list.
            r.lrem("ip_list", 1, raw)


def stone_redis(good):
    """Append proxies that are not yet known to the ``ip_list`` Redis list.

    Args:
        good: Iterable of proxies dicts as returned by valVer().
    """
    for IP in good:
        key = str(IP)
        if key in bloombloom:
            print("%s不能存储,有相同的IP" % (IP,))
            continue
        print("存储的IP:", IP)
        bloombloom.add(key)
        r.rpush("ip_list", key)


def init():
    """Seed the Bloom filter with every proxy already stored in Redis."""
    # A single LRANGE instead of two LINDEX round trips per element.
    for stored in r.lrange("ip_list", 0, -1):
        print(stored)
        bloombloom.add(stored)


if __name__ == '__main__':
    init()
    for page in range(0, 10):
        # Every second iteration (2, 4, 6, 8) re-validates the stored
        # proxies instead of scraping a new page.
        if page % 2 == 0 and page != 0:
            time_valVer(r.lrange("ip_list", 0, -1))
        else:
            stone_redis(valVer(get_ip(page)))
from flask import Flask, abort, request, jsonify
import redis  # Drive Redis from Python; the cache database can also be operated directly on the Redis server host.

r = redis.Redis(host='localhost', port=6379, password='XXX', decode_responses=True)
app = Flask(__name__)


@app.route('/ip/<int:index>', methods=['GET'])
def reponse(index):
    """Return ``{"ip": <entry>}`` for position ``index`` of the ``ip_list`` list.

    The ``ip`` value is ``null`` in the JSON response when the list has no
    element at that position.
    """
    print(index)
    # Read once so the logged value and the returned value cannot differ
    # if the list is modified between two reads.
    proxy = r.lindex("ip_list", index)
    print(proxy)
    return jsonify({"ip": proxy})


if __name__ == '__main__':
    # debug=True turns on Flask's debugger and reloader; local development only.
    app.run(debug=True)
获取ip: