import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# Listing page whose <div class="TypeList"> links to the individual galleries.
url = 'http://www.umeituku.com/bizhitupian/meinvbizhi/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'
}
# Directory the downloaded images are written to (relative to the CWD).
SAVE_DIR = 'other/tupian/'
# Seconds to wait on a request before giving up, so a stalled server
# cannot hang the crawl forever.
REQUEST_TIMEOUT = 10


def image_filename(src):
    """Return the file-name part of the image URL *src*.

    Only the URL path is considered, so a query string or fragment never
    ends up in the name. Returns '' when the path ends with a slash.
    """
    return urlparse(src).path.rsplit('/', 1)[-1]


def resolve_link(page_url, link):
    """Resolve *link* (an href/src found on *page_url*) to an absolute URL.

    Returns None when *link* is missing or empty, so callers can treat
    "no link" as the end of a chain instead of crashing on concatenation.
    """
    if not link:
        return None
    return urljoin(page_url, link)


def fetch_soup(page_url):
    """Download *page_url* and return it parsed as a BeautifulSoup object.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # The context manager closes the response on every exit path.
    with requests.get(url=page_url, headers=headers,
                      timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return BeautifulSoup(resp.text, 'html.parser')


def download_image(src):
    """Download the image at *src* into SAVE_DIR.

    Returns True when a file was written, False when no file name could be
    derived from *src*.

    Raises:
        requests.RequestException: when the image cannot be fetched.
        OSError: when the file cannot be written.
    """
    name = image_filename(src)
    if not name:
        return False
    with requests.get(url=src, headers=headers,
                      timeout=REQUEST_TIMEOUT) as img_resp:
        # Do not save an HTML error page under an image file name.
        img_resp.raise_for_status()
        content = img_resp.content
    with open(SAVE_DIR + name, mode='wb') as f:
        f.write(content)
    print(name + '下载成功!')
    return True


def crawl_gallery(start_url):
    """Follow one gallery page by page, downloading the image on each page.

    Each gallery page is expected to look like:

        <div class="ImageBody" id="ArticleId60">
          <p align="center">
            <a href="203957_2.htm">
              <img alt="" src="https://.../d0fcb718a2.jpg"/>
            </a>
          </p>
        </div>

    The <img> src is the picture to download and the <a> href is the next
    page. The walk stops when a page cannot be fetched, has no ImageBody
    image, has no next link, or links back to a page already visited.
    """
    visited = set()
    page_url = start_url
    while page_url and page_url not in visited:
        visited.add(page_url)
        try:
            page2 = fetch_soup(page_url)
        except requests.RequestException as err:
            # An unreachable or missing page marks the end of this gallery.
            print(f'Stopping at {page_url}: {err}')
            break

        body = page2.find('div', class_='ImageBody')
        img = body.find('img') if body is not None else None
        if img is None:
            # No image on this page: we walked past the end of the gallery.
            break
        src = resolve_link(page_url, img.get('src'))
        if src is None:
            break

        try:
            download_image(src)
        except requests.RequestException as err:
            # One broken image should not abort the rest of the gallery.
            print(f'Failed to download {src}: {err}')

        next_link = body.find('a')
        # Relative hrefs are resolved against the page they were found on.
        page_url = (resolve_link(page_url, next_link.get('href'))
                    if next_link is not None else None)


def main():
    """Crawl every gallery linked from the listing page at `url`."""
    os.makedirs(SAVE_DIR, exist_ok=True)
    page = fetch_soup(url)
    # `class` is a Python keyword, hence BeautifulSoup's `class_` argument.
    type_list = page.find('div', class_='TypeList')
    if type_list is None:
        print('No <div class="TypeList"> found on ' + url)
        return
    started = set()
    for a in type_list.find_all('a'):
        href = resolve_link(url, a.get('href'))
        # Skip anchors without an href and galleries already crawled.
        if href is None or href in started:
            continue
        started.add(href)
        crawl_gallery(href)


if __name__ == '__main__':
    main()