Python 例子:抓取网站上的代理 IP 列表
目标页面中存放 IP 列表的 HTML 代码类似这样(每个 <li> 是一行记录,第一个 <li> 是表头):
''' <li> <div class="ip">IP</div> <div class="port">端口</div> <div class="type">类型</div> <div class="addr" style="text-align:center;">地址</div> </li>, <li> <div class="ip">211.48.77.58</div> <div class="port">3128</div> <div class="type">透明</div> <div class="addr">韩国 CZ88.NET</div> </li> '''
以下代码可直接保存为 xxx.py 执行(需要先安装 beautifulsoup4:pip install beautifulsoup4):
#!/usr/bin/python
# coding: utf-8
"""Fetch the proxy list page from www.cz88.net and print each proxy as ip:port.

The page is expected to contain several <ul> elements; the one holding the
proxy table consists of <li> rows whose <div> children carry the classes
"ip", "port", "type" and "addr". The first <li> of that list is a header row.
"""

import re  # not used below; kept because it was part of the original listing

try:
    import httplib  # Python 2
except ImportError:  # Python 3 renamed the module
    import http.client as httplib

from bs4 import BeautifulSoup

PROXY_HOST = "www.cz88.net"
PROXY_PATH = "/proxy/"
# Index of the <ul> that holds the proxy rows. This was found by inspecting
# the page in a browser and must be re-checked if the site layout changes.
PROXY_UL_INDEX = 4
# <div> classes that make up one proxy row.
PROXY_FIELDS = ("ip", "port", "type", "addr")


def main():
    """Script entry point: fetch the proxy page and print the proxies."""
    get_proxy()


def _parse_proxy_rows(html):
    """Return one ``{field: text}`` dict per proxy row found in *html*.

    Keys are taken from PROXY_FIELDS; a key is absent when the row has no
    <div> with that class. The header row (first <li>) is skipped.

    Raises:
        IndexError: the page has fewer <ul> elements than PROXY_UL_INDEX + 1.
    """
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    ul_tags = soup.find_all("ul")
    if len(ul_tags) <= PROXY_UL_INDEX:
        raise IndexError(
            "expected at least %d <ul> elements but found %d; "
            "the page layout may have changed"
            % (PROXY_UL_INDEX + 1, len(ul_tags))
        )

    rows = []
    # [1:] skips the header row (IP / port / type / address captions).
    for li in ul_tags[PROXY_UL_INDEX].find_all("li")[1:]:
        row = {}
        # Look only at direct child <div> tags. Iterating li.children would
        # also yield whitespace text nodes, which have no attributes.
        for div in li.find_all("div", recursive=False):
            classes = div.get("class") or []
            if classes and classes[0] in PROXY_FIELDS:
                row[classes[0]] = div.get_text().strip()
        # Keeping the fields of one row together (instead of four parallel
        # lists) guarantees an ip is never paired with another row's port.
        rows.append(row)
    return rows


def get_proxy():
    """Download the proxy page and print every proxy as ``ip:port``.

    The "type" and "addr" fields are parsed as well but not printed; print
    them from the row dict if you need them.

    Raises:
        IndexError: the page does not contain the expected <ul>.
        Network errors raised by httplib / http.client propagate unchanged.
    """
    # timeout is passed by keyword: the positional "strict" argument that
    # preceded it in Python 2 no longer exists in Python 3.
    conn = httplib.HTTPConnection(PROXY_HOST, 80, timeout=5)
    try:
        conn.request("GET", PROXY_PATH)
        body = conn.getresponse().read()
    finally:
        # Always release the socket, even when the request fails.
        conn.close()

    # Debugging tip: write `body` to a file to inspect the raw HTML.
    for row in _parse_proxy_rows(body):
        if "ip" in row and "port" in row:
            print(row["ip"] + ":" + row["port"])


if __name__ == '__main__':
    main()
结果如下:
113.204.212.50:3128 181.211.191.227:8080 87.118.126.186:80 84.253.68.178:8080 200.96.113.234:8080 175.138.67.66:8888 120.195.200.90:80 218.201.21.177:80 117.169.14.81:8080 213.185.81.248:80 186.103.169.166:8080 211.138.124.216:80 218.92.227.171:21724 221.130.17.130:80 115.228.54.73:3128 180.179.118.85:3128 188.136.149.22:8080 221.206.210.195:3128 14.139.172.170:3128 217.198.115.61:80 110.176.169.214:3128 201.73.200.130:3128 195.46.211.63:80 120.195.207.153:80 113.3.86.73:9000 125.235.241.132:8080 183.108.88.31:3128 124.166.250.14:3128 120.195.205.30:80 89.32.239.118:8080 120.195.207.97:80 120.195.192.90:80 120.195.198.210:80 112.137.164.232:3128 113.157.204.25:80 91.217.42.3:8080 120.195.205.189:80 120.195.206.152:80 60.11.8.34:3128 77.61.246.62:80 60.191.164.22:3128 120.195.200.240:80 216.121.113.110:80 195.114.128.12:3128 202.79.36.119:8080 120.195.196.218:80 196.2.73.54:80 211.218.126.189:3128 221.130.17.140:80 120.195.203.13:80 120.195.198.168:80 178.216.45.5:8888 221.130.17.156:80 183.223.215.153:8123 120.195.199.104:80 203.186.108.130:65208 120.195.199.39:80 210.83.222.27:8080 61.153.198.178:3128 195.235.161.28:8080 72.159.148.20:10000 120.195.203.74:80 218.24.243.70:3128 200.47.22.33:8080 217.92.214.200:8080 91.243.163.202:8080 58.253.238.242:80 182.253.123.61:3128 190.136.174.181:8080 58.133.61.3:3128 120.195.205.138:80 120.195.206.107:80 120.195.206.205:80 120.195.201.114:80 49.91.12.226:3128 120.195.202.139:80 124.206.55.189:3128 120.195.206.43:80 201.243.108.244:8080 174.133.147.243:80 213.248.54.230:80 211.138.124.132:80 122.225.56.18:8080 124.206.89.242:3128 60.15.41.77:3128 59.60.226.219:80 221.229.252.98:8080 123.57.23.114:80 202.108.50.67:80 124.193.41.251:3128 203.187.186.142:80 110.8.253.100:80 92.50.188.98:8080 173.244.217.44:3128 195.138.78.222:8080 68.53.136.149:8088 171.100.192.33:3128 120.195.205.210:80 111.180.53.176:9000 14.139.85.68:8080 211.138.124.199:80 120.195.201.174:80 202.162.198.219:8080 212.45.5.172:3128 189.213.65.108:3128 
221.130.18.50:80 120.195.203.73:80 203.156.123.220:8080 221.130.007.226:80 177.223.12.121:8080 187.23.238.170:3128 212.2.133.140:8080 170.97.67.18:80 178.32.63.223:3128 120.195.197.134:80 218.201.21.176:80 217.91.27.179:8080 117.21.174.122:3128 103.11.116.46:8080 190.207.173.252:3128 120.195.194.127:80 78.36.202.149:3128 182.206.130.225:3128 200.85.121.5:8083