爬取图片(二)

源码:

 1 import requests
 2 from lxml import etree
 3 import os
 4 
 5 
 6 # 获取图集地址
 7 def get_url(page,headers):
 8     url = 'http://www.mzitu.com/page/{}/'.format(page)
 9     response = requests.get(url,headers=headers)
10     html_ele = etree.HTML(response.text)
11     ele_list = html_ele.xpath('//ul[@id="pins"]/li')
12     url_tuple_list = []
13     for ele in ele_list:
14         url = ele.xpath('./span/a/@href')[0]
15         name = ele.xpath('./span/a')[0].text
16         url_tuple = (url,name)
17         url_tuple_list.append(url_tuple)
18     return url_tuple_list
19 
20 
# Download every picture of one gallery
def get_pics(url, headers, name, timeout=10):
    """Download all pictures of one gallery into ``www.mzitu.com/<name>/``.

    The gallery's first page is fetched to read the page count from the
    ``pagenavi`` block; each numbered page is then fetched, the picture URL
    is read from its ``main-image`` block, and the picture is saved under
    its original file name. Pictures already on disk are not downloaded
    again. The path of every newly written file is printed.

    Args:
        url: Address of the gallery's first page.
        headers: HTTP headers sent with every request.
        name: Gallery title, used as the sub-directory name.
        timeout: Seconds to wait for the server on each request.

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an HTTP error status.
        IndexError: If a page lacks the expected ``pagenavi`` or
            ``main-image`` markup.
    """
    # Create the target directory; exist_ok avoids the check-then-create race.
    dirs_name = 'www.mzitu.com/' + name
    os.makedirs(dirs_name, exist_ok=True)

    # Read the highest page number of the gallery.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    html_ele = etree.HTML(response.text)
    # The code relies on the second-to-last pager <span> holding the last
    # page number.
    max_page = html_ele.xpath('//div[@class="pagenavi"]/a/span')[-2].text

    # Fetch and store the picture of every page.
    for page in range(1, int(max_page) + 1):
        # Page numbers below 10 are zero-padded in the URL ("/01" ... "/09").
        url_page = '{}/{:02d}'.format(url, page)
        response = requests.get(url_page, headers=headers, timeout=timeout)
        response.raise_for_status()
        html_ele = etree.HTML(response.text)
        pic_url = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')[0]

        filename = dirs_name + '/' + pic_url.split('/')[-1]
        if os.path.exists(filename):
            # Already saved by an earlier run: skip before spending a
            # request on the image bytes.
            continue

        pic_response = requests.get(pic_url, headers=headers, timeout=timeout)
        # Never write an HTTP error body to disk as if it were a picture.
        pic_response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(pic_response.content)
        print(filename)
50 
51 
if __name__ == '__main__':
    # Headers sent with every request: a Referer pointing at the site and a
    # desktop Chrome User-Agent string.
    request_headers = {
        "Referer": "http://www.mzitu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    # Walk index pages 1 and 2 and download each gallery they list.
    for page_no in (1, 2):
        for gallery_url, gallery_name in get_url(page_no, request_headers):
            get_pics(gallery_url, request_headers, gallery_name)

 

posted @ 2018-08-19 15:07  _积木城池  阅读(173)  评论(0)  编辑  收藏  举报