# python----爬虫---爬取图片万余张,应有尽有(分门别类)--第二版
#
# date:2021/01/16
# author:eihouwang
# 目标网址:http://www.netbian.com/
# 开发环境:pycharm2020.3.2、Python3.8
# 用到的模块(库):os,re,requests,urllib,time
# 整体思路:
# 1、获取主界面各个分类标签,并生成对应的文件夹 比如分类标签:rili,dongman,fengjing等
# 2、进入单个标签单独一页,获取单页图片网页列表 比如松林公路图片,帅气美女图片,美丽冬天图片等图片的网址(整个网址图片并不是高清图片)
# 3、进入单页图片网页列表中的一个网页,获取实际1920*1080像素的图片网址并进行下载
# 4、保存图片到本地
# 5、完成单标签图片收集
# 6、完成多标签图片收集
# 注意:此爬虫未用多线程,爬取时间较长(几个小时),运行之前最好没有代码中存在的文件夹,让代码自行创建,否则可能出现分类错误
# 本次选取的分类下载的图片都是不需要登录的,需要登录的没有收集
import os
import re
import time
from urllib.parse import urljoin

import requests

# Browser-like request headers so the site serves normal pages to the crawler.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0"}

# Root directory for category sub-folders (was a hard-coded inline literal).
SAVE_ROOT = "E:\\czxt\\"

# Per-request timeout in seconds; the original had none and could hang forever.
REQUEST_TIMEOUT = 30


def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns None on a non-200 status or on any network error, so callers
    must handle a None result.
    """
    try:
        r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if r.status_code == 200:
        return r.text
    return None


def get_lable_list(url):
    """Scrape the category labels from the home page and create one local
    folder per label under SAVE_ROOT.

    Returns a pair (sorted label URLs, sorted folder paths).  Returns two
    empty lists when the page cannot be fetched or the markers are missing
    (the original crashed with TypeError/IndexError in that case).
    """
    html = get_html(url)
    if html is None:
        return [], []
    # The label anchors sit between the literal markers '/em' and 'LOL' in
    # the page source; each href inside that region looks like "/<label>/".
    region = re.findall('/em(.*?)LOL', html)
    labels = re.findall('"/(.*?)/"', region[0]) if region else []
    lable_list = []
    dirnames = []
    for label in labels:
        lable_list.append(urljoin(url, label))
        # Nested labels like "a/b" would otherwise break the Windows path.
        path = SAVE_ROOT + label.replace("/", "\\") + "\\"
        # exist_ok replaces the original bare try/except: pass around makedirs.
        os.makedirs(path, exist_ok=True)
        dirnames.append(path)
    return sorted(lable_list), sorted(dirnames)


def get_onelable_list(url):
    """Return the URLs of every listing page of one category.

    Page 1 is the category URL itself; later pages are index_<n>.htm.
    Falls back to a single page when the page counter cannot be parsed
    (or the page could not be fetched at all).
    """
    html = get_html(url)
    onelable_list = []
    try:
        # The last-page number appears as ...</span>...>NN</a>...class...;
        # group 2 of the (now raw-string) pattern is that number.
        totalpages = int(re.findall(r'/span(.*?)\>(\d+)\</a(.*?)class', html)[0][1])
    except (TypeError, IndexError, ValueError):
        # TypeError: html is None; IndexError: marker absent; ValueError: not a number.
        totalpages = 1
    print(totalpages)
    for i in range(1, totalpages + 1):
        urlnew = url if i == 1 else url + '/index_' + str(i) + '.htm'
        onelable_list.append(urlnew)
        print(urlnew)
    return onelable_list


def get_one_page(url, k=0, dirnames=None):
    """Download every 1920x1080 wallpaper linked from one listing page into
    dirnames[k], skipping files that already exist locally.

    url      -- listing-page URL of one category
    k        -- index of the current category's folder inside *dirnames*
    dirnames -- list of category folder paths; defaults to an empty list
                (the original used a shared mutable default argument)
    """
    if dirnames is None:
        dirnames = []
    baseurl = "http://www.netbian.com/"
    html = get_html(url)
    n = 0
    if html is None:
        print('本页共计下载有效图片({})张'.format(n))
        return
    # Each thumbnail links to /desk/<id>.htm; the HD variant of the detail
    # page is /desk/<id>-1920x1080.htm.  Dot escaped (was a wildcard).
    ids = re.findall(r'desk/(.*?)\.htm"', html)
    # Hoisted out of the loop: the original recompiled this per image.
    pic_pattern = re.compile('left(.*?)href="(.*?)"', re.S)
    for pic_id in ids:
        # baseurl already ends with '/'; the original produced a '//desk/'.
        detail_url = baseurl + "desk/" + pic_id + "-1920x1080.htm"
        detail_html = get_html(detail_url)
        try:
            picurl = pic_pattern.findall(detail_html)[0][1]
            path = dirnames[k] + picurl.split('/')[-1]
            if os.path.exists(path):
                print('已下载!')
                continue
            r = requests.get(picurl, headers=headers, timeout=REQUEST_TIMEOUT)
            store_one_pic(path, r.content)
            n += 1
            print(path, end="--->")
            print('第({})张图片下载完成'.format(n))
        except Exception:
            # Deliberate best-effort: skip images whose detail page failed,
            # whose URL is malformed, or whose folder index is out of range.
            pass
    print('本页共计下载有效图片({})张'.format(n))


def store_one_pic(path, content):
    """Write *content* (raw image bytes) to *path*."""
    # The with-block closes the file; the original's trailing f.close()
    # after the with-statement was redundant and has been dropped.
    with open(path, 'wb') as f:
        f.write(content)


def main():
    """Crawl every category: build folders, then walk each listing page of
    each label and download its wallpapers."""
    url = "http://www.netbian.com/"
    lable_list, dirnames = get_lable_list(url)
    print('---------------分类标签共计{}个,即将分标签下载------------'.format(len(lable_list)))
    time.sleep(3)
    # enumerate replaces the three hand-rolled counters of the original.
    for folder_idx, label_url in enumerate(lable_list):
        onelable_list = get_onelable_list(label_url)
        print('获取第({})个标签,共({})页'.format(folder_idx + 1, len(onelable_list)))
        for page_no, page_url in enumerate(onelable_list, start=1):
            print('正在获取第({})页数据'.format(page_no))
            get_one_page(page_url, k=folder_idx, dirnames=dirnames)


if __name__ == "__main__":
    # Guarded entry point: importing this module no longer starts the crawl.
    main()