python---爬取图片数千张(分门别类)

目标网址:http://www.netbian.com/

代码(待优化)如下:

#
# date:2021/1/14
# author:eihouwang
# 1.获取分类标签,形成标签网址列表
# 2.单个标签网址单页图片
# 3.单个标签全部页图片
# 4.获取全部标签,全部图片

import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import os
import time

# Base site URL and a desktop-browser User-Agent so the server serves
# normal pages instead of rejecting the default `requests` UA.
url = "http://www.netbian.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 "
                  "Safari/537.36 SE 2.X MetaSr 1.0"}

# Download root: one sub-directory per wallpaper category lives here.
# NOTE(review): hard-coded Windows path — the crawl only works on a machine
# where E:\czxt\ already exists (os.listdir raises otherwise, matching the
# original behavior).
BASE_DIR = 'E:\\czxt\\'

# Seed `dirnames` with category directories left over from a previous run,
# so images can be stored without re-creating the directories.
dirnames = [BASE_DIR + entry + '\\' for entry in os.listdir(BASE_DIR)]

# Fetch a page
def get_html(url):
    """Fetch *url* and return its HTML text, or None on a non-200 status.

    Uses the module-level ``headers`` dict. A timeout is set so a stalled
    connection cannot hang the whole crawl indefinitely (the original had
    none); a failed connection still raises ``requests.RequestException``,
    as before.
    """
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return None
    return r.text

# Collect the category ("label") links from the front page
def get_lable_url_list(url):
    """Return ``(label_urls, dir_paths)``, both sorted.

    *label_urls* are the absolute category-page URLs scraped from the
    front page; *dir_paths* is the module-level ``dirnames`` list after
    creating one local directory per category under E:\\czxt\\.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.findAll('a')
    label_urls = []
    # Anchors 3..27 on the front page are the category links; the rest is
    # navigation noise. NOTE(review): position-based — breaks if the site
    # layout changes; confirm against the current page.
    for anchor in anchors[3:28]:
        url_new = urljoin(url, anchor['href'])
        label_urls.append(url_new)
        path = "E:\\czxt\\" + url_new.split('/')[-2] + "\\"
        try:
            os.mkdir(path)
            dirnames.append(path)
        except OSError:
            # Directory already exists (picked up at module load) — skip;
            # it is already present in `dirnames`.
            pass
    return sorted(label_urls), sorted(dirnames)

# Download every wallpaper image on one listing page
def get_one_page(url, k=0):
    """Download all images linked from the listing page *url*.

    *k* indexes into the sorted module-level ``dirnames`` list to pick the
    category directory the images are saved under.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    images = soup.findAll('img')      # was `list` — don't shadow the builtin
    target_dir = sorted(dirnames)[k]  # hoisted: was re-sorted for every image
    for img in images:
        src = img['src']
        try:
            # Long URLs are thumbnails of the form ".../small<hash>.<ts>.jpg";
            # dropping the "small" marker and the timestamp suffix yields the
            # full-size image URL. NOTE(review): the 89-character cutoff is
            # kept from the original — confirm against the site's current
            # URL scheme.
            if len(src) >= 89:
                parts = src.split('small')
                filename = parts[1].split('.')[0][:32] + ".jpg"
                pic_url = parts[0] + filename
            else:
                pic_url = src
                filename = src.split('/')[-1]
            path = target_dir + filename
            resp = requests.get(pic_url, headers=headers)
            print(path)
            store_one_page(path, resp.content)
        except (requests.RequestException, IndexError, OSError):
            # Best-effort crawl: skip any single image that fails and keep
            # going (narrowed from the original bare `except`).
            pass


# Save one image to disk
def store_one_page(path, content):
    """Write *content* (raw image bytes) to *path*.

    The ``with`` statement closes the file on exit; the original's explicit
    ``f.close()`` inside the block was redundant and has been removed.
    """
    with open(path, 'wb') as f:
        f.write(content)

# Build the page-URL list for one category
def get_onelable_url_list(url):
    """Return the list of listing-page URLs for the category at *url*.

    Page 1 is the category URL itself; later pages follow the site's
    ``index_<n>.htm`` pattern. Returns an empty list when the total page
    count cannot be parsed (e.g. the request failed and the HTML is None),
    matching the original's silent fallback.
    """
    html = get_html(url)
    try:
        # The pager's last numbered link carries the total page count.
        # Raw string fixes the original's invalid escapes (\> and \<).
        totalpages = int(re.findall(r'/span(.*?)\>(\d+)\</a(.*?)class', html)[0][1])
    except (TypeError, IndexError, ValueError):
        # html was None, the pager was absent, or the count was not numeric.
        return []
    page_urls = []
    for i in range(1, totalpages + 1):
        page_urls.append(url if i == 1 else url + 'index_' + str(i) + '.htm')
    return page_urls


def main():
    """Crawl every category: enumerate its pages, then download each page."""
    # Renamed the local from `dirnames`: the original shadowed the global
    # list that `get_one_page` actually reads, which was confusing (though
    # harmless, since `get_lable_url_list` also mutates the global).
    lable_url_list, dir_list = get_lable_url_list(url)
    print(dir_list)
    for k, label_url in enumerate(lable_url_list):
        print('正在访问{}---分类标签网址{}'.format(k, label_url))
        for page_url in get_onelable_url_list(label_url):
            print('正在获取({})页资源'.format(page_url))
            get_one_page(page_url, k=k)
            print('获取({})页资源完毕'.format(page_url))


# Entry-point guard so importing this module does not start the crawl.
if __name__ == "__main__":
    main()
posted @ 2021-01-16 20:35  eihouwang  阅读(176)  评论(0编辑  收藏  举报