爬取图片(二)
源码:
import os

import requests
from lxml import etree

# Seconds to wait on any single HTTP request. Without a timeout a stalled
# connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def get_url(page, headers):
    """Collect the albums listed on one index page.

    Args:
        page: Number of the index page to fetch (inserted into the URL).
        headers: HTTP headers sent with the request.

    Returns:
        A list of ``(album_url, album_name)`` tuples. List entries that lack
        a link or a title are skipped instead of aborting the crawl.
    """
    url = 'http://www.mzitu.com/page/{}/'.format(page)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html_ele = etree.HTML(response.text)

    url_tuple_list = []
    for ele in html_ele.xpath('//ul[@id="pins"]/li'):
        hrefs = ele.xpath('./span/a/@href')
        anchors = ele.xpath('./span/a')
        # An entry without a link or without title text cannot be turned
        # into a download target (the title becomes the folder name).
        if not hrefs or not anchors or not anchors[0].text:
            continue
        url_tuple_list.append((hrefs[0], anchors[0].text))
    return url_tuple_list


def _max_page(html_ele):
    """Return the number of picture pages of an album, or 1 if unknown."""
    spans = html_ele.xpath('//div[@class="pagenavi"]/a/span')
    try:
        # The second-to-last pagination entry is read as the highest page
        # number (the last one is presumably the "next" link -- confirm
        # against the live markup).
        return int(spans[-2].text)
    except (IndexError, TypeError, ValueError):
        # Pagination missing or not numeric: still try the first page.
        return 1


def get_pics(url, headers, name):
    """Download every picture of one album into ``www.mzitu.com/<name>``.

    Pictures whose target file already exists are not downloaded again.
    Pages without a picture and pictures that are not answered with HTTP 200
    are reported and skipped.

    Args:
        url: Address of the album's first page.
        headers: HTTP headers sent with every request.
        name: Album title, used as the directory name.
    """
    dirs_name = 'www.mzitu.com/' + name
    os.makedirs(dirs_name, exist_ok=True)

    # Find out how many picture pages the album has.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html_ele = etree.HTML(response.text)

    for page in range(1, _max_page(html_ele) + 1):
        # Page numbers below 10 are zero-padded in the URL: /01, /02, ...
        url_page = '{}/{:02d}'.format(url, page)
        response = requests.get(
            url_page, headers=headers, timeout=REQUEST_TIMEOUT)
        html_ele = etree.HTML(response.text)
        pic_urls = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')
        if not pic_urls:
            print('no picture found: ' + url_page)
            continue
        pic_url = pic_urls[0]

        filename = dirs_name + '/' + pic_url.split('/')[-1]
        # Check before downloading so a re-run does not fetch the picture
        # again only to throw it away.
        if os.path.exists(filename):
            continue

        pic_response = requests.get(
            pic_url, headers=headers, timeout=REQUEST_TIMEOUT)
        if pic_response.status_code != 200:
            # Do not store an error page under the picture's file name; it
            # would never be replaced because existing files are skipped.
            print('download failed ({}): {}'.format(
                pic_response.status_code, pic_url))
            continue
        with open(filename, 'wb') as f:
            f.write(pic_response.content)
        print(filename)


def main():
    """Download all albums listed on index pages 1 and 2."""
    headers = {
        "Referer": "http://www.mzitu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    for page in range(1, 3):
        for url, name in get_url(page, headers):
            get_pics(url, headers, name)


if __name__ == '__main__':
    main()