# -*- coding: utf-8 -*-
# Scrape meme images from doutula.com ("Doutu Bar") and save them to disk.
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parser
import re  # regex extraction from the prettified HTML
import os  # filesystem helpers

# Browser-like headers so the site serves the normal HTML pages.
HEADERS = {
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

SAVE_DIR = 'D:\斗图啦'  # destination folder for downloaded images
# Bug fix: create the folder up front — open() below fails if it is missing.
os.makedirs(SAVE_DIR, exist_ok=True)

page = 1  # current listing page — initialized ONCE, before the loop
# (bug fix: resetting `page` inside the loop re-crawls page 1 forever)
while True:
    # Fetch one page of the article list.
    url = 'https://www.doutula.com/article/list/?page=%s' % page
    res = requests.get(url=url, headers=HEADERS)
    soup = BeautifulSoup(res.text, 'lxml')
    listing_html = soup.prettify()

    # Extract the detail-page URL of every article block on this page.
    article_urls = re.findall(
        '<a class="list-group-item random_list tg-article" href="(.*?)">',
        listing_html)
    # An empty page means we have walked past the last one — stop.
    if not article_urls:
        print('爬取结束')
        break

    for article_url in article_urls:  # open each article page
        res1 = requests.get(url=article_url, headers=HEADERS)
        soup = BeautifulSoup(res1.text, 'lxml')
        detail_html = soup.prettify()
        # Article title used as a filename prefix; strip whitespace and
        # Windows-forbidden characters (robustness fix — prettified titles
        # contain newlines, which are illegal in file paths).
        title = re.sub(r'[\\/:*?"<>|\s]', '', soup.h1.text)
        # Each <img> carries its real source URL in the onerror fallback.
        img_urls = re.findall(
            """<img alt="" onerror="this.src='(.*?)'""", detail_html)
        for img_url in img_urls:
            # Last path segment keeps the original name and extension.
            basename = img_url.rsplit('/', 1)[1]
            save_path = os.path.join(SAVE_DIR, title + basename)
            jpg = requests.get(url=img_url, headers=HEADERS)
            # Stream to disk in chunks (bug fix: iter_content() with no
            # chunk_size iterates byte-by-byte, which is extremely slow).
            with open(save_path, mode='wb') as f:
                for chunk in jpg.iter_content(chunk_size=8192):
                    f.write(chunk)

    page += 1  # advance to the next listing page