Python - Scraping Meizitu Images (Single-threaded and Multi-threaded Versions)
1. Reference article
The code in the article referenced above explains the approach very clearly, and my basic idea is the same. The code in this post only adds some exception handling and tidies up the log output; I am writing it mainly as a note for my own future reference. The changes are as follows:
1) Exception handling: the added checks are called out separately in the code below.
2) The multi-threaded version uses the multiprocessing library, so freeze_support() has to be called at the start of the main block; otherwise, after the script is packaged into an exe, creating the worker processes can fail at runtime.
3) The multi-threaded version also adds a command-line option for choosing the number of workers. A minimal sketch of both multiprocessing-related points follows this list.
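A minimal, self-contained sketch, separate from the full script in section 3, assuming a placeholder worker named demo_task (not part of the original code):

```python
# coding=utf-8
import sys
from multiprocessing import Pool, freeze_support

def demo_task(n):
    # Placeholder worker; the real script downloads one gallery here.
    return n * n

if __name__ == '__main__':
    # Needed on Windows when the script is frozen into an exe (e.g. with
    # PyInstaller); without it, spawning worker processes can fail at startup.
    freeze_support()

    # An optional first command-line argument sets the pool size, defaulting to 1.
    count = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

    with Pool(count) as pool:
        print(pool.map(demo_task, range(10)))
```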
2. Single-threaded version
```python
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os

all_url = 'http://www.mzitu.com'

# HTTP request headers
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
# This Referer header bypasses the site's hotlink protection
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

start_html = requests.get(all_url, headers=Hostreferer)

# Save location
path = os.getcwd() + '/mzitu/'

# Find the highest list-page number
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
max_page = page[-2].text

same_url = 'http://www.mzitu.com/page/'
for n in range(1, int(max_page) + 1):  # iterate over the list pages (numbering starts at 1)
    ul = same_url + str(n)
    start_html = requests.get(ul, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
    for a in all_a:  # every gallery on the page
        title = a.get_text()  # extract the gallery title
        if title != '':
            print("About to scrape: " + title)

            # Windows cannot create a directory whose name contains '?'
            dir_name = path + title.strip().replace('?', '')
            if os.path.exists(dir_name):
                # print('Directory already exists')
                flag = 1
            else:
                os.makedirs(dir_name)
                flag = 0
            os.chdir(dir_name)
            href = a['href']
            html = requests.get(href, headers=Hostreferer)
            mess = BeautifulSoup(html.text, "html.parser")
            pic_max = mess.find_all('span')
            pic_max = pic_max[10].text  # number of pictures in this gallery
            if flag == 1 and len(os.listdir(dir_name)) >= int(pic_max):
                print('Already saved, skipping')
                continue
            for num in range(1, int(pic_max) + 1):  # every picture in the gallery
                pic = href + '/' + str(num)
                html = requests.get(pic, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_url = mess.find('img', alt=title)

                # Some <img> tags have no 'src' attribute, which would raise an
                # exception, so filter them out here
                if pic_url is None or 'src' not in pic_url.attrs:
                    continue
                print(pic_url['src'])
                html = requests.get(pic_url['src'], headers=Picreferer)
                file_name = pic_url['src'].split('/')[-1]
                f = open(file_name, 'wb')
                f.write(html.content)
                f.close()
            print('Done')
    print('Page', n, 'done')
```
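The only exception handling added to this version is the src check above; a network error would still abort the whole run. As a hedged sketch (not part of the original script), the per-image request could also be guarded with a timeout and a try/except, using a hypothetical helper such as save_image:

```python
import requests

def save_image(url, file_name, headers):
    # Hypothetical helper: returns True on success, False on any network error,
    # so the calling loop can skip the picture and continue.
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print('Download failed, skipping:', url, e)
        return False
    with open(file_name, 'wb') as f:
        f.write(resp.content)
    return True
```

The innermost loop could then call save_image(pic_url['src'], file_name, Picreferer) and simply move on when it returns False.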
3. Multi-threaded version
```python
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os
from multiprocessing import Pool
from multiprocessing import freeze_support
import sys

# HTTP request headers
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}

def find_MaxPage():
    all_url = 'http://www.mzitu.com'
    start_html = requests.get(all_url, headers=header)
    # Find the highest list-page number
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    return max_page

def Download(href, title, path):
    html = requests.get(href, headers=header)
    soup = BeautifulSoup(html.text, 'html.parser')
    pic_max = soup.find_all('span')
    pic_max = pic_max[10].text  # number of pictures in this gallery
    dir_name = path + title.strip().replace('?', '')
    if os.path.exists(dir_name) and len(os.listdir(dir_name)) >= int(pic_max):
        print('Gallery already saved, moving on to the next one: ' + title)
        return 1
    print(f"Found {pic_max} pictures, preparing: " + title)
    os.makedirs(dir_name, exist_ok=True)  # exist_ok: the directory may already exist but be incomplete
    os.chdir(dir_name)
    for num in range(1, int(pic_max) + 1):
        pic = href + '/' + str(num)
        html = requests.get(pic, headers=header)
        mess = BeautifulSoup(html.text, "html.parser")
        pic_url = mess.find('img', alt=title)
        # Some <img> tags have no 'src' attribute, which would raise an exception,
        # so filter them out here
        if pic_url is None or 'src' not in pic_url.attrs:
            continue
        print(f"{title}: {pic_url['src']}")
        html = requests.get(pic_url['src'], headers=header)  # Picreferer could be used here, as in the single-threaded version
        file_name = pic_url['src'].split('/')[-1]
        f = open(file_name, 'wb')
        f.write(html.content)
        f.close()
    print('Gallery finished: ' + title)

if __name__ == '__main__':
    freeze_support()  # prevents worker creation from failing when packaged as an exe

    # Number of workers in the pool
    count = 1
    if len(sys.argv) >= 2:
        count = int(sys.argv[1])

    pool = Pool(count)
    print(f'Starting download pool with {count} workers')

    # Save location
    path = os.getcwd() + '/mzitu_mutil/'
    max_page = find_MaxPage()  # highest list-page number, i.e. how many pages to walk
    print(f'Found {max_page} pages, please wait for the downloads to finish')
    same_url = 'http://www.mzitu.com/page/'

    for n in range(1, int(max_page) + 1):
        each_url = same_url + str(n)
        start_html = requests.get(each_url, headers=header)  # request one list page
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for a in all_a:  # every gallery on the page
            title = a.get_text()  # extract the gallery title
            if title != '':
                href = a['href']  # the gallery's URL
                pool.apply_async(Download, args=(href, title, path))
    pool.close()
    pool.join()
    print('All galleries finished')
```
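Assuming the script is saved as mzitu_mutil.py (the file name is only a guess based on the save directory), it can be run as `python mzitu_mutil.py 4` to use a pool of four workers; with no argument it falls back to a single worker. Note that multiprocessing.Pool actually creates worker processes rather than threads, which is also why freeze_support() is needed once the script is packaged into an exe.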
4. Resource download
Download link: Python scraping Meizitu images - single-threaded and multi-threaded versions
Reprint notice: Unless otherwise stated, the articles on this site are original and all rights are reserved. Please credit 朝十晚八 when reprinting.