Python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

python爬虫之反爬虫(随机user-agent,获取代理ip,检测代理ip可用性)

目录

1
2
3
4
5
随机User-Agent
 
获取代理ip
 
检测代理ip可用性

  

 

 

 

 

随机User-Agent

fake_useragent库,伪装请求头

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from fake_useragent import UserAgent

# Build the UserAgent factory once and reuse it for every header below.
ua = UserAgent()

# One representative User-Agent string per browser family.
print(ua.ie)        # Internet Explorer
print(ua.opera)     # Opera
print(ua.chrome)    # Chrome
print(ua.firefox)   # Firefox
print(ua.safari)    # Safari

# The most practical form for crawlers: a different random User-Agent on
# every call, so successive requests do not share an identical header.
print(ua.random)
print(ua.random)
print(ua.random)

 

 

 

 

获取代理ip

在免费的代理网站爬取代理ip,免费代理的采集也很简单,无非就是:访问页面 —> 正则/xpath提取 —> 保存

1
2
3
4
5
代理ip网站
有代理:https://www.youdaili.net/Daili/guonei/
66代理:http://www.66ip.cn/6.html
西刺代理:https://www.xicidaili.com/
快代理:https://www.kuaidaili.com/free/

 

 

1
2
#根据网页结果,适用正则表达式匹配
#这种方法适合翻页的网页
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import re
import requests
import  time
 
def get_ip():
    """Scrape free proxy addresses from kuaidaili's "inha" listing.

    Fetches listing pages 1-5, extracts every "IP:port" pair with a
    regular expression, and returns them.

    Returns:
        list[str]: proxies formatted as "host:port".
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    page_urls = [base_url + str(page) for page in range(1, 6)]  # pages 1..5
    print(page_urls)
    # Raw string: '\d' inside a plain literal is a SyntaxWarning on 3.12+.
    # Compiled once, outside the loop, instead of per page.
    ip_pattern = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S,
    )
    ip_list = []
    for page_url in page_urls:
        html = requests.get(url=page_url).text
        time.sleep(1)  # be polite: throttle between page fetches
        for host, port in ip_pattern.findall(html):
            ip_list.append(host + ':' + port)
    print(ip_list)
    print('共收集到%d个代理ip' % len(ip_list))
    return ip_list


if __name__ == '__main__':
    get_ip()

  

 

 

 

 

 

 

1
2
#先获取特定标签
#解析
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import requests
from bs4 import BeautifulSoup
def get_ip_list(obj):
    """Extract "IP:port" strings from a parsed xicidaili proxy table.

    Args:
        obj: a BeautifulSoup document for the xicidaili front page.
            NOTE(review): only rows with class "odd" are read, so this
            appears to skip every other row of the table — confirm intended.

    Returns:
        list[str]: proxies formatted as "host:port".
    """
    ip_list = []
    # Each proxy is one <tr class="odd"> row; cells 1 and 2 hold IP and port.
    for row in obj.findAll('tr', {'class': 'odd'}):
        cells = row.findAll('td')
        ip_list.append(cells[1].get_text() + ':' + cells[2].get_text())
    print("共收集到了{}个代理IP".format(len(ip_list)))
    print(ip_list)
    return ip_list
# Module-level script: fetch the xicidaili front page and print its proxies.
url = 'http://www.xicidaili.com/'
headers = {
    # NOTE(review): the value itself starts with a redundant "User-Agent:"
    # prefix, which the server will see as part of the header value.
    'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
request = requests.get(url, headers=headers)
response =request.text
bsObj = BeautifulSoup(response, 'lxml')     # parse the fetched HTML
lists=get_ip_list(bsObj)

 

 

 

 

 

检测代理ip可用性

第一种方法:通过返回的状态码判断

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import requests
import random
import re
import time
 
 
 
def get_ip():
    """Scrape free proxy addresses from page 1 of kuaidaili's "inha" listing.

    Returns:
        list[str]: proxies formatted as "host:port".
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    page_urls = [base_url + str(page) for page in range(1, 2)]  # page 1 only
    print(page_urls)
    # Raw string: '\d' inside a plain literal is a SyntaxWarning on 3.12+.
    ip_pattern = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S,
    )
    ip_list = []
    for page_url in page_urls:
        html = requests.get(url=page_url).text
        time.sleep(1)  # throttle between page fetches
        for host, port in ip_pattern.findall(html):
            ip_list.append(host + ':' + port)
    print('共收集到%d个代理ip' % len(ip_list))
    print(ip_list)
    return ip_list
def valVer(proxys):
    """Probe each proxy against baidu and report which ones work.

    Args:
        proxys: iterable of proxy strings, e.g. "1.2.3.4:8080".

    Returns:
        list[dict]: a requests-style ``proxies`` mapping for each proxy
        that answered with HTTP 200. (New return value; previous version
        returned None, so existing callers are unaffected.)
    """
    badNum = 0
    goodNum = 0
    good = []
    for proxy_host in proxys:
        # NOTE(review): entries from get_ip() are bare "ip:port", so the
        # 'https' substring check always selects 'http' — kept as written.
        protocol = 'https' if 'https' in proxy_host else 'http'
        proxies = {protocol: proxy_host}
        print('现在正在测试的IP:', proxies)
        try:
            # Only the network call stays inside the try block.
            response = requests.get('http://www.baidu.com',
                                    proxies=proxies, timeout=2)
        except requests.RequestException as e:
            # Narrowed from `except Exception`: only network/timeout
            # failures mean "bad proxy"; programming errors now surface.
            print(e)
            badNum += 1
            continue
        if response.status_code != 200:
            badNum += 1
            print(proxy_host, 'bad proxy')
        else:
            goodNum += 1
            good.append(proxies)
            print(proxy_host, 'success proxy')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)
    return good
 
if __name__ == '__main__':
    # Collect proxies first, then check each one for usability.
    valVer(get_ip())

  

 

 

 

 

第二种方法:使用requests包来进行验证

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
import random
import re
import time
 
 
 
def get_ip():
    """Scrape free proxy addresses from page 1 of kuaidaili's "inha" listing.

    Returns:
        list[str]: proxies formatted as "host:port".
    """
    base_url = 'https://www.kuaidaili.com/free/inha/'
    page_urls = [base_url + str(page) for page in range(1, 2)]  # page 1 only
    print(page_urls)
    # Raw string: '\d' inside a plain literal is a SyntaxWarning on 3.12+.
    ip_pattern = re.compile(
        r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>',
        re.S,
    )
    ip_list = []
    for page_url in page_urls:
        html = requests.get(url=page_url).text
        time.sleep(1)  # throttle between page fetches
        for host, port in ip_pattern.findall(html):
            ip_list.append(host + ':' + port)
    print(ip_list)
    print('共收集到%d个代理ip' % len(ip_list))
    return ip_list
def valVer(proxys):
    """Check each "ip:port" proxy by fetching wenshu.court.gov.cn through it.

    Args:
        proxys: iterable of proxy strings, e.g. "1.2.3.4:8080".

    Returns:
        list[str]: the proxies that connected successfully. (New return
        value; previous version returned None, callers unaffected.)
    """
    badNum = 0
    goodNum = 0
    good = []
    for proxy in proxys:
        print("现在正在检测ip", proxy)
        try:
            requests.get('http://wenshu.court.gov.cn/',
                         proxies={"http": "http://" + str(proxy)}, timeout=2)
        except requests.RequestException:
            # Narrowed from a bare `except:`; only network errors mean "bad".
            badNum += 1
            print('connect failed')
        else:
            # BUG FIX: was `goodNum = 1`, which froze the success count at 1
            # no matter how many proxies worked.
            goodNum += 1
            good.append(proxy)
            print('success')
    print('success proxy num : ', goodNum)
    print('bad proxy num : ', badNum)
    print(good)
    return good
 
if __name__ == '__main__':
    # Collect proxies first, then check each one for usability.
    valVer(get_ip())

 

 

 

 

第三种方法:使用telnet

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# NOTE(review): telnetlib is deprecated since Python 3.11 and removed in 3.13;
# a plain socket.create_connection() achieves the same reachability check.
import telnetlib

# If a raw TCP connection to the proxy's host:port opens, the proxy is alive.
try:
    telnetlib.Telnet('127.0.0.1', port='80', timeout=20)
except OSError:
    # BUG FIX: the original used Python 2 `print 'x'` statements, which are
    # a SyntaxError under Python 3. Also narrowed the bare `except:` to
    # OSError, the base of the socket/timeout errors Telnet raises.
    print('connect failed')
else:
    print('success')

  

posted @   -零  阅读(4243)  评论(0编辑  收藏  举报
编辑推荐:
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
阅读排行:
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?
点击右上角即可分享
微信分享提示