bs4 parsing - Umeituku (优美图库) gallery


import requests
from bs4 import BeautifulSoup

url = 'http://www.umeituku.com/bizhitupian/meinvbizhi/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41'
}
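
# Added as a safeguard (not in the original post): create the output folder used further
# down, so that open(..., mode='wb') does not fail on a missing directory.
import os
os.makedirs('other/tupian', exist_ok=True)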

resp = requests.get(url=url, headers=headers)
resp.encoding = 'utf-8'

# 1. Hand the page source to BeautifulSoup to produce a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # specify the parser
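# Note (not in the original post): 'html.parser' is Python's built-in parser; if lxml is
# installed, BeautifulSoup(resp.text, 'lxml') is a faster drop-in alternative.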

# 2. Locate the exact part of the page we need
# The first call, find('div', class_='TypeList'), only narrows things down to the outer container;
# printing it at this point is hard to work with because it still contains a lot of clutter
# The second call, find_all('a'), then collects every <a> tag inside that container and returns a list
alist = page.find('div', class_='TypeList').find_all('a')  # class is a Python keyword, so the argument is spelled class_
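# Not in the original post, just an equivalent form: BeautifulSoup also accepts CSS selectors
# via select(), so the line above could also be written as
#     alist = page.select('div.TypeList a')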

for a in alist:
    # Get the URL of the next-level (detail) page behind each listing entry
    # Read the 'href' attribute of each matching <a> tag
    href = a.get('href')
    # print(href)

    while True:
        # Fetch the next-level page
        resp2 = requests.get(url=href, headers=headers)
        resp2.encoding = 'utf-8'

        # Build a bs4 object for this page
        page2 = BeautifulSoup(resp2.text, 'html.parser')

        # Locate the image
        '''
        # 1. page2.find('div', class_="ImageBody") locates the block below
        <div class="ImageBody" id="ArticleId60">
            <p align="center">
                <a href="203957_2.htm">
                    <img alt="" src="https://i1.huishahe.com/uploads/tu/201911/9999/d0fcb718a2.jpg"/>
                </a>
            </p>
        </div>
        # 2. A further find('img') narrows it down to
        <img alt="" src="https://i1.huishahe.com/uploads/tu/201911/9999/d0fcb718a2.jpg"/>
        # 3. And get('src') finally yields
        https://i1.huishahe.com/uploads/tu/201911/9999/d0fcb718a2.jpg
        The idea is to drill down one level at a time
        '''
        # If no src is found on this page an AttributeError is raised, which means we have
        # reached the last page, so break out of the inner loop
        try:
            src = page2.find('div', class_="ImageBody").find('img').get('src')
        except AttributeError:
            break

        # Download the image
        img_resp = requests.get(url=src, headers=headers)
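        # Optional guard (not in the original post): one could check img_resp.status_code here
        # and skip the write below when the image request did not return 200, e.g.
        #     if img_resp.status_code != 200:
        #         break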
        # print(src)
        # Derive a file name from the image URL
        name = src.split('/')[-1]
        with open('other/tupian/' + name, mode='wb') as f:
            f.write(img_resp.content)
        print(name + ' downloaded successfully!')

        # If there is no next-page link an AttributeError is raised, so stop the inner loop.
        try:
            next_href = page2.find('div', class_="ImageBody").find('a').get('href')
        except AttributeError:
            break
        href = 'http://www.umeituku.com/bizhitupian/meinvbizhi/' + next_href
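        # Alternative (not in the original post): the relative link could also be resolved with
        # urllib.parse.urljoin against the current page URL instead of hard-coding the prefix:
        #     from urllib.parse import urljoin
        #     href = urljoin(resp2.url, next_href)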
        # print(href)
        resp2.close()
        img_resp.close()
resp.close()
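
A possible refinement, sketched here and not part of the original script: reuse a single requests.Session so all requests share one connection pool and the User-Agent only has to be set once.

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # any realistic browser UA string works
listing = session.get('http://www.umeituku.com/bizhitupian/meinvbizhi/')
listing.encoding = 'utf-8'
print(listing.status_code)
# ...the rest of the script stays the same, with every requests.get(url=..., headers=headers)
# call replaced by session.get(...), and the per-request headers argument dropped.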