# 实现爬取图片 — crawl ivsky.com listing pages and download the images they link to.
from requests_html import HTMLSession
# One HTTP session shared by every request this script makes.
session = HTMLSession()
# Site root; scraped relative paths are appended to it to form full URLs.
BASE_URL = 'https://www.ivsky.com'
# 获取图片页码链接
def get_page_url(start=1, stop=5):
    """Yield the URL of each listing page in the ``ziranfengguang`` category.

    Args:
        start: Number of the first listing page to yield (inclusive).
        stop: Page number at which to stop (exclusive). The defaults
            reproduce the original hard-coded pages 1 through 4.

    Yields:
        One ``index_<n>.html`` URL string per page number, in ascending order.
    """
    # The total page count is not discovered automatically; a caller who
    # knows it passes it in rather than editing the range by hand.
    for page_number in range(start, stop):
        yield 'https://www.ivsky.com/tupian/ziranfengguang/index_{}.html'.format(page_number)
# 获取总图的链接 ==》》 单个图的所有
# 测试:
# r = session.get(url='https://www.ivsky.com/tupian/ziranfengguang/index_1.html')
#
# BASE_URL = 'https://www.ivsky.com'
# element_list = r.html.find('.il_img a')
# for element in element_list:
# # print(element.attrs.get('href'))
# a_url = BASE_URL + element.attrs.get('href')
# print(a_url)
# title = element.attrs.get('title')
# # 进入到具体的图片内部
# h = session.get(url=a_url)
# element_list = h.html.find('.il_img img')
# for element in element_list:
# url = element.attrs.get('src')[15:]
# url_detail = BASE_URL + url
# print(url_detail)
def get_url_page(url):
    """Download every image from each gallery linked on a listing page.

    Fetches ``url``, follows each ``.il_img a`` link to its gallery page and
    passes every ``.il_img img`` found there to ``save``.

    Args:
        url: Address of a listing page.
    """
    listing = session.get(url=url)
    for link in listing.html.find('.il_img a'):
        href = link.attrs.get('href')
        title = link.attrs.get('title')
        # Without an href there is nothing to follow, and without a title
        # there is no folder name to store the images under; concatenating
        # None would raise TypeError, so skip such anchors.
        if href is None or title is None:
            continue
        # Open the gallery page behind this link.
        gallery = session.get(url=BASE_URL + href)
        # Number images from 1 so file names read "<title>第1张", "<title>第2张", ...
        for index, image in enumerate(gallery.html.find('.il_img img'), start=1):
            src = image.attrs.get('src')
            if src is None:
                continue
            # Drop the first 15 characters of src and re-root the rest on
            # BASE_URL (presumably this strips a '//img.ivsky.com' host
            # prefix — TODO confirm against the live markup).
            url_detail = BASE_URL + src[15:]
            name = title + '第{}张'.format(index)
            # The title is the folder name and the numbered name is the file name.
            save(url_detail, title, name)
# 文件夹下的名字有所区别 即风景图片/具体的标题/单个的图片.png,因此传name字段过去
import os
def save(url, title, name):
    """Download one image and write it to ``风景图片/<title>/<name>.png``.

    Args:
        url: Address of the image to download.
        title: Gallery title; used as the sub-folder name.
        name: File name without the ``.png`` extension.
    """
    root_dir = '风景图片'
    gallery_dir = os.path.join(root_dir, title)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(gallery_dir, exist_ok=True)
    image_path = os.path.join(gallery_dir, name + '.png')
    response = session.get(url=url)
    # Do not write an HTTP error page to disk and report it as a saved image.
    if not response.ok:
        print('{}图片下载失败,状态码{}'.format(name, response.status_code))
        return
    with open(image_path, 'wb') as f:
        f.write(response.content)
    print('{}图片保存成功'.format(name))
if __name__ == '__main__':
    # Walk each listing page in turn and download the galleries it links to.
    for listing_url in get_page_url():
        get_url_page(listing_url)
# 可以考虑单行打印进度条 前提是知道所有的图片长度不好弄
# from tqdm import tqdm
# import time
#
# pbar = tqdm(total=100,desc='michael')
# for i in range(100):
# pbar.update(1)
# time.sleep(0.05)
# pbar.close()
# posted on 2019-08-08 19:28 michael-chang 阅读(192) 评论(0) 编辑 收藏 举报