06.IP池设计

设计IP池:

  应用场景:

  1.防止网站IP检测,封掉IP,终止爬虫程序运行

  2.无痕浏览器 绕过非强制验证码问题

  3.防识别

设计思路:

  1.IP来源

  2.IP管理

    2.0.IP存活检测

    2.1.IP程序中管理

  3.IP应用

第一:

1.IP来源做法:(不管你的IP来源在哪里:收费IP网站、免费IP网站……只要是提供IP资源的地方都可以)

  源源不断地向提供IP的地方索取IP(这个过程我们不需要吝啬,一直运行即可)

2.IP管理做法:(提供两种方式 Redis池 或者 存入txt文档)

  将源源不断的IP存入到Redis池 或者txt文档

2.0.IP存活检测做法:

  时间 + 检测接口

2.1.IP程序管理:

  将IP放入队列中,针对多线程使用

3.IP应用

  将取出来的IP应用到requests中

 

第二:

代码实现

我这里把IP来源和IP管理写在了一起:

Redis池方式:

import redis,time, threading, random, requests, telnetlib, os
from threading import Thread
from concurrent.futures import ThreadPoolExecutor
from queue import Queue


class IPS_():
    """Collector that keeps pulling proxy addresses and stores them in Redis.

    Worker threads poll the provider URLs, probe each returned address with a
    TCP connect, and push live ones onto ``queue_ip``. ``add()`` drains that
    queue into Redis, storing each address as a key that expires after ``ex``
    seconds.
    """

    def __init__(self, host='123.59.207.171', port=6379, password='amms..bridge',
                 ex=50, ip_urls=None):
        """Store connection settings and create the shared queue / thread pool.

        All parameters are optional; the defaults are the values that used to
        be hard-coded, so ``IPS_()`` behaves as before.

        Args:
            host: Redis server host.
            port: Redis server port.
            password: Redis password.
            ex: Lifetime in seconds of every stored proxy key.
            ip_urls: Iterable of provider URLs to poll; ``None`` selects the
                two built-in endpoints.
        """
        # NOTE(review): the default host/password and the provider URLs embed
        # real-looking credentials; prefer passing them in from configuration.
        self.host = host
        self.port = port
        # Redis returns bytes by default; decode_responses=True yields str.
        self.decode_responses = True
        # Expiry (seconds) applied to each stored proxy.
        self.ex = ex
        self.password = password
        if ip_urls is None:
            ip_urls = ['http://http1.9vps.com/getip.asp?username=13835372142&pwd=235b75eb472ee6e5afe3418a345773b3&geshi=1&fenge=1&fengefu=&getnum=100',
                       'http://http1.9vps.com/getip.asp?username=13835372142&pwd=784942f3cbc0a52493fd1d1e1764d0ee&geshi=1&fenge=1&fengefu=&getnum=100']
        self.IpUrls = list(ip_urls)
        self.Lock = threading.Lock()
        self.queue_ip = Queue()
        self.threadPoll = ThreadPoolExecutor(max_workers=20)

    def link(self):
        """Create the pooled Redis client and store it on ``self.re_pool``."""
        self.re_pool = redis.Redis(connection_pool=redis.ConnectionPool(
            host=self.host, port=self.port,
            decode_responses=self.decode_responses, password=self.password))

    def thread_PullIP(self):
        """Submit one ``pullIP`` worker per provider URL to the thread pool.

        Returns immediately; the workers keep running in the background.
        """
        for ipurl in self.IpUrls:
            self.threadPoll.submit(self.pullIP, ipurl)

    def pullIP(self, pro_url):
        """Poll one provider URL forever and queue every address that is alive.

        Requests are spaced at least 5 seconds apart. After more than 100
        consecutive failures the whole process is terminated with exit code 1.

        Args:
            pro_url: Provider URL; must contain ``pwd=...&geshi``, the value
                of which is used only to label log lines.
        """
        pwd = pro_url.split('pwd=')[1].split('&geshi')[0]
        lasttime = time.time()
        i = 1  # consecutive-failure counter
        while True:
            try:
                # Keep at least 5 seconds between two requests to the provider.
                sleeptime = time.time() - lasttime
                if sleeptime < 5:
                    time.sleep(5 - sleeptime)
                lasttime = time.time()

                # strip() guards against a trailing newline ending up in the
                # Redis key or in the port passed to the probe.
                # NOTE(review): the built-in URLs ask for getnum=100, yet the
                # response is handled as a single host:port - confirm.
                ip = requests.get(pro_url, timeout=3).text.strip()
                if ip == 'false!error!请等待 5秒后再提取!':
                    print('false!error!请等待 5秒后再提取! -- {}'.format(pwd))
                    continue

                # Liveness probe: a TCP connect that is closed right away.
                # The context manager releases the socket the original leaked.
                parts = ip.split(':')
                with telnetlib.Telnet(parts[0], port=parts[1], timeout=3):
                    pass

                # Success: reset the consecutive-failure counter.
                i = 1
                self.queue_ip.put(ip)
            except Exception as e:
                # Any failure (HTTP error, bad format, dead proxy) counts.
                i += 1
                print('ERROR: -- {} -- {}'.format(e, pwd))

            if i > 100:
                print('ERROR:连续一百次获取不到ip')
                # os._exit terminates the whole process, including the other
                # worker threads; 1 signals abnormal termination.
                os._exit(1)

    def add(self):
        """Move queued addresses into Redis forever, skipping existing keys.

        Each new address is stored with its expected expiry timestamp as the
        value and ``self.ex`` seconds as the key's time-to-live.
        """
        pause = 5 / len(self.IpUrls)
        while True:
            if self.queue_ip.empty():
                time.sleep(pause)
                continue

            ip = self.queue_ip.get()
            try:
                # EXISTS checks one key; the previous KEYS call listed the
                # whole database for every single address.
                if self.re_pool.exists(ip):
                    print('ip重复: {}'.format(ip))
                else:
                    time.sleep(pause)
                    print('添加ip: {}'.format(ip))
                    self.re_pool.set(ip, round(time.time()) + self.ex, ex=self.ex)
            finally:
                # Always balance get() so queue accounting stays correct even
                # when Redis raises.
                self.queue_ip.task_done()

    def run(self):
        """Connect to Redis, start the pull workers, then block in ``add()``."""
        self.link()
        self.thread_PullIP()
        self.add()

    def all(self):
        """Return every key currently stored in the Redis database."""
        return self.re_pool.keys()



if __name__ == '__main__':
    # Script entry point: build the collector and enter its endless run loop.
    collector = IPS_()
    collector.run()

按照上面的设计,这个py程序需要一直保持运行状态,持续向IP池中补充IP。

 

IP存活检测:

import redis, time, sys, os


class IPS_():
    """Client that hands out the freshest proxy address stored in Redis.

    Every key in the database is treated as a proxy address and its value as
    an integer expiry timestamp (presumably written by the collector script -
    verify against the writer).
    """

    def __init__(self, host='123.59.207.171', port=6379, password='amms..bridge', ex=50):
        """Store connection settings and open the Redis client.

        All parameters are optional; the defaults are the values that used to
        be hard-coded, so ``IPS_()`` behaves as before.

        Args:
            host: Redis server host.
            port: Redis server port.
            password: Redis password.
            ex: Expiry in seconds; stored but not used by this client.
        """
        # NOTE(review): the default host/password are hard-coded credentials;
        # prefer passing them in from configuration.
        self.host = host
        self.port = port
        # Redis returns bytes by default; decode_responses=True yields str.
        self.decode_responses = True
        self.ex = ex
        self.password = password
        self.re_pool = self._connect()
        # Last address handed out, so the same one is not returned twice in a row.
        self.lastip = ''
        self.lasttime = time.time()
        # Minimum number of seconds between two Redis lookups.
        self.sleeptime_ = 2.5

    def _connect(self):
        """Build a Redis client backed by a fresh connection pool."""
        # health_check_interval is given to the pool: connection-level options
        # passed to Redis() are not applied when connection_pool is supplied.
        pool = redis.ConnectionPool(host=self.host, port=self.port,
                                    decode_responses=self.decode_responses,
                                    password=self.password,
                                    health_check_interval=30)
        return redis.Redis(connection_pool=pool)

    @staticmethod
    def _freshest(keys, values):
        """Return the key whose value is the largest integer, or ``None``.

        Entries whose value is ``None`` (key vanished between KEYS and MGET)
        or not an integer are ignored. Ties keep the earliest key.
        """
        best_key = None
        best_expiry = None
        for key, value in zip(keys, values):
            if value is None:
                continue
            try:
                expiry = int(value)
            except (TypeError, ValueError):
                continue
            # Numeric comparison: as strings, '9' would sort above '10'.
            if best_expiry is None or expiry > best_expiry:
                best_key, best_expiry = key, expiry
        return best_key

    def one(self):
        """Block until a proxy different from the previous one is available.

        Lookups are spaced ``sleeptime_`` seconds apart. An empty database
        terminates the process with exit code 1. Any exception from Redis
        triggers a 10 second pause and a reconnect.

        Returns:
            The key (proxy address) with the largest expiry value.
        """
        while True:
            elapsed = time.time() - self.lasttime
            if elapsed < self.sleeptime_:
                time.sleep(self.sleeptime_ - elapsed)
            self.lasttime = time.time()

            try:
                keys = self.re_pool.keys()
                if not keys:
                    print('ERROR: --- IP池为空,检查IP池')
                    os._exit(1)
                ip = self._freshest(keys, self.re_pool.mget(keys))
            except Exception as e:
                print('ERROR: ---- 延时10秒,重新链接,检查网络  -- 报错信息 - {}'.format(e))
                time.sleep(10)
                self.re_pool = self._connect()
                continue

            # None: every key expired between KEYS and MGET - just look again
            # instead of reporting a network error.
            if ip is None or ip == self.lastip:
                continue
            self.lastip = ip
            return ip

    def all(self):
        """Return every key currently stored in the Redis database."""
        return self.re_pool.keys()

    def test(self):
        """Print proxies returned by ``one()`` in an endless loop (manual check)."""
        while True:
            print(self.one())


if __name__ == '__main__':
    # Manual check: print whatever the client hands out, forever.
    client = IPS_()
    client.test()

 

IP应用:

import sys

# Directory that contains the IP-source / IP-management module (redis_cli.py).
# Raw string: "\W" and "\I" are not valid escape sequences in a normal string
# literal and trigger a SyntaxWarning on recent Python versions.
sys.path.append(r"D:\Work\IPS")
from redis_cli import IPS_

# Shared client instance used by the functions in this script.
ips = IPS_()


def getpro():
    """Return the next proxy address handed out by the shared ``ips`` client."""
    return ips.one()

if __name__ == '__main__':
    # Demo: fetch and print proxies in an endless loop.
    while True:
        proxy = getpro()
        print(proxy)

 

额外加IP程序内的管理:

import sys
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Lock, Thread

# Directory that contains redis_cli.py. Raw string: "\J", "\j", "\Z" and "\I"
# are not valid escape sequences in a normal string literal and trigger a
# SyntaxWarning on recent Python versions.
sys.path.append(r"D:\JR\jr\ZKGIT\IPS")
from redis_cli import IPS_

# Shared client instance used by the code in this script.
ips = IPS_()

class First(object):
    """In-program proxy buffer: one producer fills a queue, workers take from it."""

    def __init__(self):
        # Kept for backward compatibility; Queue does its own locking, so
        # get_ip() no longer needs it.
        self.lock1 = Lock()
        self.ip_queue = Queue()

    def put_ip(self):
        """Fetch proxies from ``ips`` forever and enqueue each new one.

        An address equal to the one enqueued immediately before is skipped.
        """
        old_ip = ''
        while True:
            ip = ips.one()
            # Skip when this address is the same as the previous one.
            if old_ip == ip:
                continue
            # Remember it; without this update the check above never fires.
            old_ip = ip
            self.ip_queue.put(ip)

    def get_ip(self):
        """Block until a proxy is available and return it.

        ``Queue.get()`` is thread-safe, so concurrent callers each receive a
        different item without extra locking. The previous manual
        acquire/release would have left the lock held if ``get()`` raised.
        """
        return self.ip_queue.get()

    def function(self, threadpool):
        """Take one proxy from the queue and print it.

        Args:
            threadpool: Accepted for interface compatibility; not used here.
        """
        ip = self.get_ip()
        print(ip)

if __name__ == '__main__':
    first = First()
    threadpool = ThreadPoolExecutor(max_workers=10)
    # Producer runs as a daemon thread so it does not keep the process alive.
    # daemon=True replaces the deprecated Thread.setDaemon() call.
    t = Thread(target=first.put_ip, args=(), daemon=True)
    t.start()
    first.function(threadpool)

结束。

 

 

 

 

  

posted @ 2022-03-04 15:11  锋芒毕露的蜘蛛  阅读(250)  评论(0)  编辑  收藏  举报