1 import os
2 import re
3 from time import perf_counter
4 from functools import wraps
5
6 import requests
7 from scrapy import Selector
8
def timer(func):
    """Decorator that prints how long each call to *func* took."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = perf_counter()
        value = func(*args, **kwargs)
        elapsed = perf_counter() - started
        # Same message text as before; the label is the wrapped function's name.
        print('{cls_name} {args} spend time: {time:.5f}'.format(
            cls_name=func.__name__, args=args, time=elapsed))
        return value
    return wrapper
20
def get_content_css(url):
    """Fetch *url* and return a scrapy ``Selector`` over the decoded HTML.

    Fix: the original call had no ``timeout``, so a stalled server could
    hang the whole crawl forever.  The page is decoded as UTF-8, matching
    the original behaviour.
    """
    req = requests.get(url, timeout=30)
    html = req.content.decode('utf-8')
    return Selector(text=html)
26
def get_one_page_dict(url):
    """Yield ``{'url': ..., 'title': ...}`` for each gallery listed on *url*."""
    page = get_content_css(url)
    for item in page.css('ul#pins li'):
        link = item.css('a::attr(href)').extract_first()
        alt_text = item.css('a>img::attr(alt)').extract_first()
        yield {'url': link, 'title': alt_text}
35
def get_all_page_dict(url, type_page_num):
    """Collect the gallery dicts from every listing page of the current type."""
    groups = []
    for page_num in range(1, type_page_num + 1):
        groups.extend(get_one_page_dict(url % page_num))
    return groups
44
def get_group_max_page_num(group_url):
    """Return the highest picture-page number shown in a gallery's pager."""
    pager_texts = get_content_css(group_url).css('.pagenavi a span::text').extract()
    # The last entry is the "next" control; the one before it is the max page.
    return pager_texts[-2]
50
def get_img_download_url(image_url):
    """Return the direct download URL of the image shown on *image_url*."""
    page = get_content_css(image_url)
    return page.css('.main-image img::attr(src)').extract()[0]
56
def is_dir(dir_name):
    """Ensure *dir_name* exists as a directory.

    Returns True when the path already existed, False when this call
    created it — the same contract as the original.

    Fix: the original exists()/mkdir() pair was racy (another process
    could create the directory between the two calls and crash mkdir);
    EAFP with ``FileExistsError`` closes that window.
    """
    try:
        os.mkdir(dir_name)
    except FileExistsError:
        return True
    return False
64
def filter_char(title):
    """Sanitize a gallery title so it is safe to use as a directory name.

    Removes ``<b>``/``</b>`` markup tags, characters illegal in Windows
    file names (``" * / : ; ? | < > \\``), and all spaces.

    Fix: the original pattern wrote the tags as character classes
    (``[<b>]`` / ``[</b>]``), which deleted every '<', '>', '/', and the
    letter 'b' individually — e.g. it turned "bamboo" into "amoo".  The
    tag alternative comes first so whole tags are removed before the
    bracket characters are stripped.
    """
    pattern = r'</?b>|["*/:;?|<>\\]'
    return re.sub(pattern, '', title).replace(' ', '')
69
def get_mz_type_max_page_num(url):
    """Return the last listing-page number for the current gallery type."""
    nav_texts = get_content_css(url).css('div.nav-links a::text').extract()
    # Second-to-last link in the nav bar carries the maximum page number.
    return nav_texts[-2]
75
@timer
def save_image(dir_name, image_download_url):
    """Download one image and save it inside the *dir_name* directory.

    The Referer header is required — the site rejects hot-linked requests.

    Fix: the original prepended ``dir_name`` to the basename and then
    joined with ``dir_name`` again, producing paths like
    ``title/title01.jpg``; the file is now saved as ``title/01.jpg``.
    A timeout is also added so a stalled download cannot hang the crawl.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Connection': 'Keep-Alive',
        'Referer': "http://www.mzitu.com/",
    }
    req = requests.get(image_download_url, headers=headers, timeout=30)
    filename = image_download_url.rsplit('/', 1)[-1]
    save_path = os.path.join(dir_name, filename)
    with open(save_path, 'wb') as f:
        f.write(req.content)
88
def loop_and_save_img(group_url, group_title, group_max_page_num):
    """Download every picture page of one gallery into its own folder."""
    for picture_num in range(1, group_max_page_num + 1):
        page_url = group_url + '/' + str(picture_num)
        save_image(group_title, get_img_download_url(page_url))
94
def get_url():
    """Prompt for a gallery type and return the matching listing-URL template.

    The template contains a ``%s`` page placeholder (except for 'zipai/',
    whose comment-page format is not yet supported by the caller).

    Globals ``mz_type_default`` and ``mz_type_in`` are kept because
    ``get_something_amazing()`` reads them after this call.

    Fixes: removed a dead assignment (the plain ``url = ... + mz_type``
    was immediately overwritten by both branches), replaced
    ``__len__()`` with ``len()``, hoisted the compiled regex out of the
    input loop, and dropped a stray semicolon.
    """
    global mz_type_default
    mz_type_default = {
        '1': ['性感妹子', 'xinggan/'],
        '2': ['日本妹子', 'japan/'],
        '3': ['台湾妹子', 'taiwan/'],
        '4': ['清纯妹子', 'mm/'],
        '5': ['妹子自拍', 'zipai/'],
        '6': ['我全都要', ''],
    }
    for menu_key in range(1, len(mz_type_default) + 1):
        print(menu_key, ' : ', mz_type_default.get(str(menu_key))[0])
    choice_re = re.compile('^[1-6]$')
    while True:
        global mz_type_in
        mz_type_in = str(input("选择妹子类型(1-6数字)(5还没有实现):"))
        if choice_re.match(mz_type_in):
            mz_type = mz_type_default.get(mz_type_in)[1]
            break
    if mz_type == 'zipai/':
        url = 'http://www.mzitu.com/' + mz_type + 'comment-page-{page_num}/#comments'
    else:
        url = 'http://www.mzitu.com/' + mz_type + 'page/%s/'
    return url
113
def get_something_amazing():
    """Entry point: crawl the chosen gallery type and download every image."""
    url = get_url()
    type_page_num = int(get_mz_type_max_page_num(url % (1)))
    group_lists = get_all_page_dict(url, type_page_num)
    print(mz_type_default[mz_type_in][0], ' 共有 ', type_page_num, ' 页 ', len(group_lists), ' 组图片')
    for group in group_lists:
        group_url = group['url']
        group_title = filter_char(group['title'])
        print(group_url, group_title)
        # Skip galleries whose folder already exists (previously downloaded).
        if is_dir(group_title):
            print(group_title, ' 已存在,跳过')
            continue
        max_page = int(get_group_max_page_num(group_url))
        loop_and_save_img(group_url, group_title, max_page)
129
if __name__ == "__main__":
    start = perf_counter()
    get_something_amazing()
    end = perf_counter()
    # Same output text as before, rendered with f-strings.
    print(f"{'end':*^100}")
    print(f'download all images cost time:{end - start:.3f}')