import os
import re
import ssl
import urllib.request

# Skip SSL certificate verification (the site's certificate may otherwise fail to validate)
ssl._create_default_https_context = ssl._create_unverified_context


# Fetch the HTML content of the given page number
def getContent(pageNum):
    print("Start crawling page %d" % pageNum)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
        "Connection": "keep-alive",
    }

    # e.g. https://pic.netbian.com/4kmeinv/index_6.html
    url = "https://pic.netbian.com/4kmeinv/index_%d.html" % pageNum
    if pageNum == 1:
        # The first page has no numeric suffix
        url = "https://pic.netbian.com/4kmeinv/index.html"

    # Build the request object (URL + request headers)
    req = urllib.request.Request(url, headers=headers)

    page = urllib.request.urlopen(req).read()
    # The site's pages are encoded in GBK
    page = page.decode("GBK")
    print("Finished crawling page %d" % pageNum)
    return page


# Extract the image URLs from the page content
def getPhotoUrl(content):
    pattern = re.compile(r'<img src="(.+?)" alt=".*?" />')
    res = re.findall(pattern, content)
    urls = []
    for url in res:
        # The src attribute is a relative path, so prepend the site root
        newUrl = "https://pic.netbian.com" + url
        urls.append(newUrl)
    return urls


# Download every image in the list for the given page
def downLoadAllPhoto(urls, page):
    # Create the target directory if it does not exist
    path = r"E:\李泽成\素材\人像2"
    if not os.path.isdir(path):
        os.mkdir(path)

    for index, url in enumerate(urls):
        filePath = "%s/%d_%d" % (path, page, index)
        print("Downloading image %d: %s" % (index, url))
        downloadFile(url, filePath)


# Download a single image
def downloadFile(url, path):
    # Keep the original file extension, e.g. aaa.png
    ext = url.split(".")[-1]
    path = path + "." + ext

    # Set the request headers used by urlretrieve
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", "Mozilla/5.0")]
    urllib.request.install_opener(opener)
    # Download the file
    urllib.request.urlretrieve(url, path)


if __name__ == '__main__':
    for i in range(1, 140):
        page = getContent(i)
        photoList = getPhotoUrl(page)
        downLoadAllPhoto(photoList, i)