I wrote a small crawler. Why does it always error out as soon as I route it through a proxy IP?
import urllib.request
import re
import os
import random
import threading


def url_open(url):
    # This is the part that never works: with the proxy block below
    # enabled, the request always errors out. The proxy IPs were taken
    # from a free online proxy list.
    #ips = ['117.136.234.12:80', '218.189.26.20:8080',
    #       '202.194.101.150:80', '180.166.112.47:8888']
    #proxy = urllib.request.ProxyHandler({'http': random.choice(ips)})  # also tried '124.202.174.66:8118'
    #opener = urllib.request.build_opener(proxy)
    #opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36')]
    #urllib.request.install_opener(opener)

    # Without the proxy, the plain request below works fine.
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    urlobject = urllib.request.urlopen(req)
    response = urlobject.read()
    return response


def find_page(html):
    # Pull the current page number, e.g. "[1333]", out of the front page.
    s2 = r'\[\d{4}\]'
    m = re.search(s2, html)
    page = m.group()
    return page


def find_page_link(html):
    # Collect the image URLs embedded in the page.
    s = r'http://ww[0-9]\.sinaimg\.cn/mw600/\w+\.jpg'
    m = re.findall(s, html)
    return m


def save_page(jpg):
    # Download each image and save it under its original file name.
    for file in jpg:
        data = url_open(file)
        name = 'E:\\作业\\j_d\\' + file.split('/')[-1]
        with open(name, 'wb') as f:
            f.write(data)


def down_jpg(dir_name=r'E:\作业\j_d', page=10, pages=10):
    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)
    # The starting page number can also be scraped from the front page:
    #page = int(find_page(url_open('http://jandan.net/ooxx').decode('utf-8'))[1:-1])
    for i in range(pages):
        page += 1
        url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        print(url)
        data = url_open(url).decode('utf-8')
        page_list = find_page_link(data)
        save_page(page_list)


if __name__ == '__main__':
    p = threading.Thread(target=down_jpg, args=(r'E:\作业\j_d', 1555, 10))
    c = threading.Thread(target=down_jpg, args=(r'E:\作业\j_d', 1024, 10))
    p.start()
    c.start()
    p.join()
    c.join()
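The errors are probably not in the proxy code itself: addresses scraped from free proxy lists go stale within hours, and a dead proxy makes urlopen hang until it times out or raise urllib.error.URLError. Below is a minimal, defensive sketch of the proxied fetch, as a hypothetical url_open_via_proxy (the ips list is just a placeholder; any fresh proxies would do). Instead of committing to one proxy with install_opener(), it opens through the opener directly, sets a short timeout, and falls through to the next proxy on failure:

import random
import urllib.error
import urllib.request

# Placeholder proxies; substitute fresh ones, these are almost certainly dead.
ips = ['117.136.234.12:80', '218.189.26.20:8080',
       '202.194.101.150:80', '180.166.112.47:8888']

def url_open_via_proxy(url):
    # Try the proxies in random order; a broken one just gets skipped
    # instead of poisoning every later request the way install_opener() would.
    for ip in random.sample(ips, len(ips)):
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': ip}))
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        try:
            # A short timeout keeps a dead proxy from blocking the thread.
            return opener.open(url, timeout=5).read()
        except (urllib.error.URLError, OSError) as e:
            print(ip, 'failed:', e)
    raise RuntimeError('no working proxy for ' + url)

If every address fails this way, the problem is the proxy list rather than the script; swapping url_open for something like this at least turns a silent hang into a visible per-proxy error message.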