"""Scrape free HTTP proxy addresses ("ip:port") from paginated listing pages."""

import urllib.request
import re
import time
import random


# Browser-like User-Agent header attached to every request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36"
)

# An IPv4-looking address following `IP">`, then at most 50 arbitrary
# characters (lazy, so the nearest port cell is chosen), then the digits
# following `"PORT">`.  Ports may have up to 5 digits (max 65535).
_PROXY_PATTERN = re.compile(
    r'IP">((?:\d{1,3}\.){3}\d{1,3})[\s\S]{0,50}?"PORT">(\d{2,5})'
)


def getResponse(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a browser-like User-Agent header.  Errors raised by
    ``urllib.request.urlopen`` propagate to the caller.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", _USER_AGENT)

    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(req) as resp:
        return resp.read()


def getHtml(url, charSet="utf-8"):
    """Fetch *url* and return its body decoded with *charSet*."""
    return getResponse(url).decode(charSet)


def createOpenner(ipList):
    """Install a global urllib opener that proxies HTTP through a random entry.

    One address is picked at random from *ipList* and registered for the
    "http" scheme only; the resulting opener is installed process-wide via
    ``urllib.request.install_opener``.

    Raises:
        IndexError: if *ipList* is empty (from ``random.choice``).
    """
    m_proxy = urllib.request.ProxyHandler({"http": random.choice(ipList)})
    openner = urllib.request.build_opener(m_proxy)
    urllib.request.install_opener(openner)


def _extractProxies(html_str):
    """Return every "ip:port" string found in *html_str*, in document order."""
    return [host + ":" + port for host, port in _PROXY_PATTERN.findall(html_str)]


def getProxyList(url, iPage=10, delay=2):
    """Collect proxy addresses from pages ``url + "1"`` .. ``url + str(iPage)``.

    Args:
        url: Base URL; the 1-based page number is appended to it.
        iPage: Number of pages to fetch.  Values below 1 fetch nothing.
        delay: Seconds to wait between consecutive page requests.

    Returns:
        A list of "ip:port" strings in the order they were found.
    """
    ipList = []
    for page in range(1, iPage + 1):
        # Pause only between requests; there is nothing to wait for before
        # the first page or after the last one.
        if page > 1 and delay > 0:
            time.sleep(delay)
        ipList.extend(_extractProxies(getHtml(url + str(page))))

    return ipList


if __name__ == "__main__":
    # Scrape 1 page.
    ipList = getProxyList("http://www.kuaidaili.com/free/outha/", 1)

    print(ipList)
快代理:"IP\">((?:\d{1,3}\.){3}(?:\d{1,3}))(?:[\s\S]*?)\"PORT\">(\d{2,4})"
说明:其中的 (?:[\s\S]*?) 原来写作 [\s\S]*(不带问号),默认是贪婪模式,会一直匹配到页面中最后一个 "PORT">,结果只能取到一个地址;加上问号开启非贪婪模式后,才能逐条取到每个地址。
西刺:"((?:\d{1,3}\.){3}(?:\d{1,3}))(?:[\s\S]*?)(\d{2,4})"
返回ipList列表:['46.101.3.126:8118', '177.207.234.14:80', '113.255.49.49:80', '52.59.18.222:80', '36.81.0.138:8080', '54.165.24.194:80', '115.252.35.104:8080', '136.169.58.21:8080', '51.254.106.65:80', '178.238.213.246:8080', '49.205.212.243:8080', '137.135.166.225:8131', '168.63.24.174:8138', '179.243.46.131:8080', '186.90.160.245:8080']