说明:
1. 有很多细节需要注意!
2. str 是 Python 的内置类型名(并不是保留字),不要用它作变量名,否则会覆盖内置的 str()。
3. 保存为 txt 文件时报编码错误:用 open() 打开文件时指定 encoding="utf-8" 即可。
4. 遇到 403 错误时,给请求添加 headers(例如 User-Agent),伪装成浏览器访问。
5. 正则的 match 只能从字符串开头匹配,search 可以在中间匹配;两者都返回匹配对象,可以用 span() 查看匹配到的字符索引。不过还是用 findall 查找全部匹配比较方便,它直接返回一个列表。
6.
# Download the first few sinaimg-hosted JPEGs linked from a jandan.net page
# and save them in the current directory as 0.jpg, 1.jpg, ...
import re
import urllib.request

PAGE_URL = "http://jandan.net/ooxx"

# Upper bound on how many images are saved per run.
MAX_IMAGES = 10

# Browser-like User-Agent sent with every request (page and images alike);
# it was added to get around HTTP 403 responses.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

# Matches protocol-relative image URLs as they appear in the page markup, e.g.
#   <img src="//ww2.sinaimg.cn/mw600/7064b124jw1enncg4zsmij20dw0ijgn0.jpg" ...>
# The path part excludes quotes, whitespace and angle brackets so that one
# match can never run past the end of an attribute into a neighbouring tag.
IMAGE_URL_RE = re.compile(r"""//[0-9a-z]+\.sinaimg\.cn[^\s"'<>]+?\.jpg""")


def fetch(url):
    """Return the raw response body of *url* as bytes.

    The request carries HEADERS, and the response is closed even when
    reading fails. Errors raised by urllib propagate to the caller.
    """
    request = urllib.request.Request(url=url, headers=HEADERS)
    with urllib.request.urlopen(request) as response:
        return response.read()


def find_image_urls(html, limit=MAX_IMAGES):
    """Return at most *limit* absolute image URLs found in *html*.

    URLs are returned in document order with an "http:" scheme prepended.
    Fewer than *limit* entries (possibly none) are returned when the page
    has fewer matches; pass ``limit=None`` to get every match.
    """
    return ["http:" + path for path in IMAGE_URL_RE.findall(html)[:limit]]


def main():
    """Fetch the page and save the images it links to as ``<index>.jpg``."""
    html = fetch(PAGE_URL).decode("utf-8")
    for index, image_url in enumerate(find_image_urls(html)):
        # Download first so a failed request does not leave an empty file.
        data = fetch(image_url)
        with open("{}.jpg".format(index), "wb") as image_file:
            image_file.write(data)


if __name__ == "__main__":
    main()