Crawling mzitu: first version
import os
import re
from time import perf_counter
from functools import wraps

import requests
from scrapy import Selector

## Decorator that prints how long the wrapped function took
def timer(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        fmt = '{name} {args} spend time: {time:.5f}'
        print(fmt.format(name=func.__name__, args=args, time=end_time - start_time))
        return result
    return wrapper

## Fetch a page and wrap it in a scrapy Selector for CSS queries
def get_content_css(url):
    req = requests.get(url)
    content = req.content.decode('utf-8')
    return Selector(text=content)

## Yield the title and link of every image group on one listing page
def get_one_page_dict(url):
    selector = get_content_css(url)
    for item in selector.css('ul#pins li'):
        yield {
            'url': item.css('a::attr(href)').extract_first(),
            'title': item.css('a>img::attr(alt)').extract_first(),
        }

## Collect the image groups from every listing page of the chosen category
def get_all_page_dict(url, type_page_num):
    dicts = []
    for page_num in range(1, type_page_num + 1):
        for page_dict in get_one_page_dict(url % page_num):
            dicts.append(page_dict)
    return dicts

## Read the highest page number from a group's pagination bar
def get_group_max_page_num(group_url):
    selector = get_content_css(group_url)
    return selector.css('.pagenavi a span::text').extract()[-2]

## Extract the direct download URL of the image on a group page
def get_img_download_url(image_url):
    selector = get_content_css(image_url)
    return selector.css('.main-image img::attr(src)').extract()[0]

## Return True if the directory already exists; otherwise create it and return False
def is_dir(dir_name):
    if os.path.exists(dir_name):
        return True
    os.mkdir(dir_name)
    return False

## Strip <b>/</b> tags, characters illegal in file names, and spaces
def filter_char(name):
    name = re.sub(r'</?b>', '', name)
    return re.sub(r'["*/:;?<>|\\]', '', name).replace(' ', '')

## Read the highest page number of the chosen category
def get_mz_type_max_page_num(url):
    selector = get_content_css(url)
    return selector.css('div.nav-links a::text').extract()[-2]

## Save a single image; the Referer header gets past the site's hotlink check
@timer
def save_image(dir_name, image_download_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Connection': 'Keep-Alive',
        'Referer': 'http://www.mzitu.com/',
    }
    req = requests.get(image_download_url, headers=headers)
    filename = image_download_url.rsplit('/', 1)[-1]
    save_img = os.path.join(dir_name, filename)
    with open(save_img, 'wb') as f:
        f.write(req.content)

## Walk every page of a group and save each image
def loop_and_save_img(group_url, group_title, group_max_page_num):
    for page_num in range(1, group_max_page_num + 1):
        image_url = get_img_download_url(group_url + '/' + str(page_num))
        save_image(group_title, image_url)

## Ask the user for a category and return its listing-URL template
def get_url():
    global mz_type_default, mz_type_in
    mz_type_default = {
        '1': ['Sexy girls', 'xinggan/'],
        '2': ['Japanese girls', 'japan/'],
        '3': ['Taiwanese girls', 'taiwan/'],
        '4': ['Innocent girls', 'mm/'],
        '5': ['Selfies', 'zipai/'],
        '6': ['All of them', ''],
    }
    for idx in range(1, len(mz_type_default) + 1):
        print(idx, ' : ', mz_type_default[str(idx)][0])
    while True:
        mz_type_in = str(input('Pick a category (enter a digit 1-6; 5 is not implemented yet): '))
        if re.compile('^[1-6]$').match(mz_type_in):
            mz_type = mz_type_default[mz_type_in][1]
            break
    if mz_type == 'zipai/':
        # Selfies live under comment pages; this branch is not implemented yet.
        url = 'http://www.mzitu.com/' + mz_type + 'comment-page-%s/#comments'
    else:
        url = 'http://www.mzitu.com/' + mz_type + 'page/%s/'
    return url

## Where the magic starts
def get_something_amazing():
    url = get_url()
    type_page_num = int(get_mz_type_max_page_num(url % 1))
    group_lists = get_all_page_dict(url, type_page_num)
    print(mz_type_default[mz_type_in][0], ' has ', type_page_num, ' pages and ',
          len(group_lists), ' image groups')
    for group_list in group_lists:
        group_url = group_list['url']
        group_title = filter_char(group_list['title'])
        print(group_url, group_title)
        if is_dir(group_title):
            print(group_title, ' already exists, skipping')
        else:
            group_max_page_num = int(get_group_max_page_num(group_url))
            loop_and_save_img(group_url, group_title, group_max_page_num)

if __name__ == '__main__':
    start = perf_counter()
    get_something_amazing()
    end = perf_counter()
    print(format('end', '*^100'))
    print('download all images cost time: {:.3f}'.format(end - start))
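One thing worth noting about this first version: get_content_css fetches the listing and group pages with bare requests.get calls, while save_image already sends a browser User-Agent and a Referer. A possible refinement, not part of the original script, is to share a single requests.Session that carries those same headers on every request. A minimal sketch, assuming the header values already used in save_image:

import requests
from scrapy import Selector

# Hypothetical refinement: one shared session so every request, not just the
# image downloads, carries browser-like headers and reuses the connection.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'http://www.mzitu.com/',
})

def get_content_css(url):
    # Same behavior as the original helper, but through the shared session,
    # with a timeout so one stalled page cannot hang the whole run.
    req = session.get(url, timeout=10)
    return Selector(text=req.content.decode('utf-8'))

With a pooled session, Keep-Alive comes for free, and there is only one place to change if the site starts rejecting the current headers.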