28构建IP池与从csv中提取ip
1.技巧
打印的字标红
# 用 \033[31m…\033[0m 包裹要标红的文字,例如: a = 1; print(a, '\033[31m变红\033[0m')
2.IP池 与 IP提取
ip池
# -*- coding: utf-8 -*-
# @Time: 2022/12/18
# @Author: 十架bgm
# @FileName: IP池12-18
"""Build a pool of free proxy IPs scraped from www.ip3366.net.

Each scraped ``ip:port`` candidate is immediately live-tested against
Baidu; working proxies are appended to ``usableIP.csv``.  The file is
written with an ``IP`` header row so downstream readers can load the
column by name (e.g. ``pandas`` via ``data['IP']``).
"""
import os

import requests
from lxml import etree
from faker import Factory

Fact = Factory.create()
ua = Fact.user_agent()


def request_header():
    """Return request headers carrying a randomized User-Agent."""
    headers = {
        'User-Agent': ua
    }
    return headers


# Two lists used to hold proxy ips:
all_ip_list = []     # every ip scraped from the site
usable_ip_list = []  # ips that passed the liveness check


def send_request():
    """Scrape proxy candidates page by page and test each as it is found."""
    # Crawl 7 pages; adjust the range as needed.
    for i in range(1, 8):
        print(f'正在抓取第{i}页……')
        response = requests.get(url=f'http://www.ip3366.net/free/?page={i}',
                                headers=request_header())
        text = response.text
        # Parse with XPath to pull the ip and port columns out of the table.
        html = etree.HTML(text)
        tr_list = html.xpath('/html/body/div[2]/div/div[2]/table/tbody/tr')
        for td in tr_list:
            ip_ = td.xpath('./td[1]/text()')[0]    # ip
            port_ = td.xpath('./td[2]/text()')[0]  # port
            proxy = ip_ + ':' + port_              # e.g. 115.218.5.5:9000
            all_ip_list.append(proxy)
            test_ip(proxy)  # check right away whether the ip is usable
    print('抓取完成!')
    print(f'抓取到的ip个数为:{len(all_ip_list)}')
    print(f'可以使用的ip个数为:{len(usable_ip_list)}')
    print('分别有:\n', usable_ip_list)


def test_ip(proxy):
    """Probe *proxy* against Baidu; record it in usable_ip_list if alive."""
    # Build the proxy mapping; requests expects scheme-prefixed URLs.
    proxies = {
        "http": "http://" + proxy,
        "https": "http://" + proxy,
    }
    try:
        # timeout=1 keeps dead proxies from stalling the whole crawl.
        response = requests.get(url='https://www.baidu.com/',
                                headers=request_header(),
                                proxies=proxies,
                                timeout=1)
        response.close()
        if response.status_code == 200:
            usable_ip_list.append(proxy)
            print(proxy, '\033[31m可用\033[0m')
        else:
            print(proxy, '不可用')
    except requests.RequestException:
        # Narrowed from a bare ``except:`` — only network/timeout errors
        # mean "this proxy is bad"; anything else should surface.
        print(proxy, '请求异常')


if __name__ == '__main__':
    send_request()
    # Open the file once (not once per ip).  Write the 'IP' header only
    # when the file is new, so the companion reader's data['IP'] lookup
    # works and repeated runs don't stack extra header rows.
    need_header = not os.path.exists('usableIP.csv')
    with open('usableIP.csv', 'a', encoding='utf-8') as f:
        if need_header:
            f.write('IP\n')
        for usable_ip in usable_ip_list:
            f.write(usable_ip + '\n')
ip提取(从csv文件提取)
# -*- coding: utf-8 -*-
# @Time: 2022/12/19
# @Author: 十架bgm
# @FileName: 从csv文件提取ip
"""Pick one random proxy from usableIP.csv and verify the outgoing IP.

Reads the 'IP' column written by the pool-builder script, chooses a
proxy at random, then calls api.ipify.org through it to confirm which
address the remote side actually sees.
"""
import random

import pandas as pd
import requests
from faker import Factory

Fact = Factory.create()
ua = Fact.user_agent()

# data = pd.read_csv('./爬虫练习/自动获取海量ip/usableIP.csv')
data = pd.read_csv('./usableIP.csv')  # expects a file with an 'IP' header
ips = data['IP']                      # only the 'IP' column
ip_list = list(ips)                   # Series -> plain python list
print(f'ip池:{ip_list}')
print(f'ip数量为:{len(ip_list)}')

# random.choice replaces the manual randint(0, len-1) indexing.
ip = random.choice(ip_list)
print(f'随机抽取的id为:{ip}')

headers = {
    'User-Agent': ua
}
# requests expects scheme-prefixed proxy URLs; the bare "ip:port" the
# original used is not a valid proxy URL.
proxies = {
    "http": "http://" + ip,
    "https": "http://" + ip,
}
response = requests.get(url='https://api.ipify.org/?format=json',
                        headers=headers, proxies=proxies, timeout=1)
print(f'访问网站“https://api.ipify.org/?format=json”的ip是:{response.text}')
本文来自博客园,作者:__username,转载请注明原文链接:https://www.cnblogs.com/code3/p/16991369.html
分类:
标签:
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步