Python 爬虫入门 1:爬取代理服务器地址
刚开始学,只会一点正则表达式,目前还只能爬取第 1 页。以后还会加入测试。
1 #coding:utf-8 2 3 import urllib 4 import urllib2 5 import re 6 7 #抓取代理服务器地址 8 Key = 1 9 url = 'http://www.xicidaili.com/nt/%s' %Key 10 #print url 11 12 user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 headers={'User-Agent' :user_agent} 14 15 try: 16 request = urllib2.Request(url,headers=headers) 17 response=urllib2.urlopen(request) 18 html=response.read() 19 pattern=re.compile('<td class="country".*?<td>(\d+).(\d+).(\d+).(\d+)</td>.*?<td>(\d+)</td>',re.S) 20 items=re.findall(pattern,html) 21 for item in items: 22 # if item !='HTTP'or'HTTPS': 23 print "%s.%s.%s.%s:%s" %(item[0],item[1],item[2],item[3],item[4]) 24 except urllib2.URLError,e: 25 if hasattr(e,'code'): 26 print e.code 27 if hasattr(e,'reason'): 28 print e.reason
Output
112.112.95.25:9999 113.66.236.53:9797 14.221.165.46:9797 123.121.79.213:9000 219.133.10.211:9797 113.109.248.12:9797 27.46.48.187:9797 115.183.11.158:9999 112.93.208.231:8080 113.78.254.84:9000 121.35.243.157:8080 42.157.5.154:9999 218.75.144.25:9000 113.65.8.221:9999 218.56.132.158:8080 59.59.144.135:53281 119.129.96.33:9797 115.213.60.99:53281 221.237.154.58:9797 120.86.180.173:9797 112.250.65.222:53281 27.37.22.243:9000 123.138.89.133:9999 175.171.184.36:53281 113.76.96.161:9797 183.29.130.106:9000 119.90.63.3:3128 175.171.186.171:53281 183.184.194.15:9797 218.241.234.48:8080 113.200.159.155:9999 218.6.145.11:9797 218.56.132.156:8080 223.199.175.107:808 14.221.166.140:9000 220.249.185.178:9999 122.72.18.34:80 139.224.24.26:8888 122.72.18.60:80 61.163.139.168:9797 202.120.46.180:443 122.72.18.61:80 125.45.87.12:9999 116.85.24.26:8080 222.86.191.44:8080 112.74.94.142:3128 61.163.139.168:9797 114.255.212.17:808 118.178.228.175:3128 122.72.18.35:80 101.37.79.125:3128 113.89.52.86:9999 113.118.96.132:9797 101.81.142.10:9000 61.155.164.106:3128 114.115.140.25:3128 171.37.176.140:9797 58.252.6.165:9000 61.163.39.70:9999 121.8.170.53:9797 175.174.118.141:8080 118.119.168.172:9999 171.37.143.140:9797 119.39.68.212:808 124.90.30.103:8118 59.38.61.23:9797 1.196.161.163:9999 113.116.76.212:8088 122.136.212.132:53281 203.174.112.13:3128 221.217.49.196:9000 14.29.84.50:8080 175.17.156.139:8080 175.17.174.218:9000 114.221.125.161:8118 123.139.56.238:9999 113.87.163.152:808 101.6.33.113:8123 61.155.164.112:3128 180.140.161.138:9797 221.7.49.209:53281 120.9.75.45:9999 183.184.112.78:9797 116.236.151.166:8080 119.122.2.160:9000 119.129.96.142:9797 116.52.195.113:9999 61.155.164.109:3128 112.86.248.163:8118 115.171.47.184:9000 116.30.218.76:9000 123.7.38.31:9999 218.29.111.106:9999 114.101.35.113:54214 124.89.33.75:9999 114.254.4.208:9797 183.54.192.211:9797 218.17.8.110:8118 183.30.201.123:9797 119.123.244.95:9000 ***Repl Closed***