# python爬煎蛋妹子图 (Python scraper for the "meizi" pictures on jandan.net)
# python3
# jiandan meizi tu
import urllib
import urllib.request as req
import os
import time
import random

# User-Agent values; one is chosen at random for every outgoing request.
_USER_AGENTS = ['Mozilla/4.0', 'Mozilla/4.1', 'Mozilla/4.5', 'Mozilla/5.1']

# HTTP proxies ("host:port") that url_open2 picks from.
_PROXY_LIST = ['117.135.251.136:82']

# Text that marks the start of an image tag whose source is an absolute URL.
_IMG_PREFIX = 'img src="http'
# Offset from the start of _IMG_PREFIX to the first character of the URL,
# i.e. len('img src="').
_URL_OFFSET = 9
# An address is only accepted if '.jpg' shows up within this many characters
# of the start of the tag.
_MAX_TAG_SPAN = 255


def _make_request(url):
    """Build a Request for *url* with a randomly chosen User-Agent header."""
    return urllib.request.Request(
        url, headers={'User-Agent': random.choice(_USER_AGENTS)})


def url_open(url):
    """Fetch *url* through urllib's default opener and return the body bytes.

    The response is closed before returning.
    """
    with urllib.request.urlopen(_make_request(url)) as response:
        return response.read()


def url_open2(url):
    """Fetch *url* through a randomly chosen HTTP proxy and return the bytes.

    The chosen proxy address is printed. The opener is used for this call
    only; it is not installed as the process-wide default, so url_open is
    not affected by the proxy configured here.
    """
    ip = random.choice(_PROXY_LIST)
    print (ip)

    proxy = req.ProxyHandler({'http': ip})
    opener = req.build_opener(proxy, req.HTTPHandler)
    with opener.open(_make_request(url)) as conn:
        return conn.read()


def get_current_page(url):
    """Return the current comment-page number found on *url*, as a string.

    The page is expected to contain ``current-comment-page">[N]``; the text
    between the brackets is returned.

    Raises:
        ValueError: if the marker or the closing bracket is missing.
    """
    html = url_open2(url).decode('utf-8')
    marker = html.find('current-comment-page')
    if marker == -1:
        raise ValueError('current-comment-page marker not found in ' + url)
    # 23 = len('current-comment-page') + len('">['), i.e. skip to the digits.
    start = marker + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError('unterminated current-comment-page value in ' + url)
    return html[start:end]


def find_imgs(url):
    """Return the list of .jpg image addresses found in the page at *url*.

    Every ``img src="http...`` occurrence is inspected; the address is kept
    (cut right after the first ``.jpg``) when that extension occurs within
    the tag's own quoted value and within 255 characters of the tag start.
    Tags pointing at other file types are skipped.
    """
    html = url_open2(url).decode('utf-8')
    img_addrs = []
    tag = html.find(_IMG_PREFIX)
    while tag != -1:
        addr_start = tag + _URL_OFFSET
        window_end = tag + _MAX_TAG_SPAN
        # Never search past the closing quote of this src value; otherwise a
        # non-jpg image followed by a jpg yields one garbage address spanning
        # both tags and the real jpg is lost.
        closing_quote = html.find('"', addr_start)
        if closing_quote != -1:
            window_end = min(window_end, closing_quote)

        ext = html.find('.jpg', tag, window_end)
        if ext != -1:
            img_addrs.append(html[addr_start:ext + 4])
            resume = ext
        else:
            # Step over the prefix we just matched so the search advances.
            resume = tag + len(_IMG_PREFIX)
        tag = html.find(_IMG_PREFIX, resume)
    return img_addrs


def save_imgs(folder,img_addrs):
    """Download every address in *img_addrs* into the current directory.

    The file name is the last path component of the address. *folder* is
    accepted for interface compatibility but not used: files are written
    relative to the current working directory.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so a failed fetch does not leave
        # an empty file behind.
        img = url_open2(each)
        with open(filename,'wb') as f:
            f.write(img)


def download_mm(folder = 'xx',pages = 300):
    """Download the images of up to *pages* comment pages into *folder*.

    The folder is created if it does not exist and becomes the current
    working directory. The newest page number is read from the board's
    front page; the loop then walks backwards one page at a time, pausing
    two seconds on every third iteration.

    Args:
        folder: directory that receives the downloaded files.
        pages: maximum number of pages to visit.
    """
    # Create the target directory on first use; a no-op when it exists.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    time_format = "%Y-%m-%d %H:%M:%S"
    current_page_num = int(get_current_page(url))
    for i in range(pages):
        print (time.strftime(time_format,time.localtime()),'current_page_num', current_page_num)
        if i%3 == 0:
            print (time.strftime(time_format,time.localtime()),"sleep 2 seconds...")
            time.sleep(2)
        # The decrement happens before the fetch, so the page number reported
        # by get_current_page itself is never downloaded.
        current_page_num -= 1
        if current_page_num < 1:
            # Ran out of pages before reaching the requested count.
            break
        page_url = url + 'page-' + str(current_page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()