煎蛋网妹子图爬虫总结
这次是只用字符串查找的方式来找网页中图片链接的
#!/usr/bin/python
# coding:utf-8
"""Scraper for the 'ooxx' picture pages of jandan.net.

Image links are located by plain string searching (no HTML parser):
find 'img src=', then the '.jpg' suffix, and slice the span between
them.  Each image is saved under a sequential numeric file name.
"""
import urllib.request
import os
import time
import random


def url_open(url):
    """Fetch *url* through a randomly chosen proxy and return raw bytes.

    A browser User-Agent header is attached so the site does not
    reject the request as an obvious bot.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')

    proxylist = ['111.192.44.204:9000',
                 '222.82.222.242:9999',
                 '124.202.247.110:8080']
    proxy = random.choice(proxylist)
    # BUG FIX: the original used the key 'htttp', so the handler matched no
    # scheme and the proxy was silently ignored; 'http' actually applies it.
    proxy_handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_handler)
    # Install the opener globally on urllib.request so urlopen() uses it.
    urllib.request.install_opener(opener)

    # BUG FIX: pass the Request object (which carries the UA header), not the
    # bare url string — the original discarded req, so the header was never sent.
    response = urllib.request.urlopen(req)
    return response.read()


def parse_img_addrs(html):
    """Extract 'http://....jpg' addresses from *html* by string search.

    Pure helper (no I/O) so the search/slice logic is testable on its own.
    """
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        # Only accept a '.jpg' suffix within 100 chars of the tag start.
        b = html.find('.jpg', a, a + 100)
        if b != -1:
            # a+9 skips the 9 chars of 'img src="'; b+4 keeps '.jpg'.
            # The sliced link is protocol-relative ('//host/...'), so
            # prepend 'http:' to make it absolute.
            img_addrs.append('http:' + html[a + 9:b + 4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs


def find_img(url):
    """Download the page at *url* and return the list of image addresses."""
    # decode('utf-8'): bytes -> str (the inverse, encode('utf-8'), goes str -> bytes)
    html = url_open(url).decode('utf-8')
    img_addrs = parse_img_addrs(html)
    for each in img_addrs:
        print(each)
    return img_addrs


def save_img(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    Files are named with the global sequential counter ``j`` so names
    never collide across pages.
    """
    global j
    print(img_addrs)
    for each in img_addrs:
        j += 1
        with open(str(j) + '.jpg', 'wb') as f:
            f.write(url_open(each))


def download_mm(folder="d://xx22", page_num=26):
    """Main driver: create *folder* and scrape pages page_num-1 down to 0.

    *page_num* is a new defaulted parameter generalizing the previously
    hard-coded 26; existing callers are unaffected.
    """
    # BUG FIX: os.mkdir raised FileExistsError on a re-run;
    # makedirs with exist_ok=True is idempotent.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    global j
    j = 0

    # NOTE(review): the site 煎蛋网 is jandan.net — original said
    # 'jiandan.net', which does not resolve to it; confirm if intentional.
    url = "http://jandan.net/ooxx/"
    # Original decremented page_num inside `for i in range(page_num)`,
    # visiting pages 25, 24, ..., 0 — expressed directly here.
    for page in range(page_num - 1, -1, -1):
        page_url = url + 'page-' + str(page) + '#comments'
        print(page)

        img_addrs = find_img(page_url)
        save_img(folder, img_addrs)
        time.sleep(1)  # be polite to the server


if __name__ == "__main__":
    download_mm()