爬取图片

步骤

1. 拿到页面源代码，然后提取子页面的链接地址（href 属性）
2. 通过 href 拿到子页面内容，从子页面找到图片的下载地址（img 标签的 data-src 懒加载属性）
3. 下载图片并保存到本地

import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Listing page that links to the individual illustration posts.
url = "https://moetu.club/category/illustration"
# Browser-like User-Agent so the site does not reject the scripted request.
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 QuarkPC/1.9.5.160"
}

resp = requests.get(url, headers=header)
resp.encoding = 'utf-8'  # force UTF-8 so Chinese titles decode correctly

# Hand the page source to BeautifulSoup and narrow to the archive section,
# so the regex below only scans the post listing, not the whole page.
main_page = BeautifulSoup(resp.text, "html.parser")
resp.close()  # connection no longer needed once the text is parsed
archive_row = main_page.find("div", attrs={"class": "archive-row"})
# Raw string instead of the original f-string: an f-string regex would break
# as soon as the pattern ever contains `{}` quantifiers. Captures each post's
# link (address) and title from the archive markup.
obj = re.compile(
    r'<div class="post-info">.*?<h2><a.*?href="(?P<address>.*?)">(?P<title>.*?)</a></h2>',
    re.S,
)
ret = obj.finditer(str(archive_row))
# Visit each article page, find its first image, and save it under img_/.
os.makedirs("img_", exist_ok=True)  # open() below fails if the dir is missing
for i in ret:
    # Fetch the child (article) page that the listing entry links to.
    child_page_resp = requests.get(i.group("address"), headers=header)
    child_page = BeautifulSoup(child_page_resp.text, "html.parser")
    child_page_resp.close()

    # The article body lives in div.entry-content; the real image URL is
    # lazy-loaded via the data-src attribute, not src.
    content = child_page.find("div", attrs={"class": "entry-content"})
    if content is None:
        continue  # unexpected page layout — nothing to download here
    img = content.find("img")
    if img is None:
        continue  # article has no image
    data = img.get("data-src")
    if not data:
        continue  # image is not lazy-loaded / attribute missing

    # Download the image bytes and save them under the URL's last path segment.
    img_name = data.split("/")[-1]  # everything after the final '/' in the URL
    img_resp = requests.get(data)
    with open("img_/" + img_name, mode="wb") as f:
        f.write(img_resp.content)  # .content is the raw bytes of the image
    img_resp.close()
    print("over!" + img_name)
    time.sleep(0.25)  # be polite: throttle requests to the server
print("all over!")
posted @ 2024-11-28 23:17  屈臣  阅读(15)  评论(0编辑  收藏  举报