Web scraper: Doutula (斗图啦)

Scraping meme images from doutula.com: walk the article list page by page, open every post, pull out each image URL, and save the files to disk.

import requests                 # HTTP requests
from bs4 import BeautifulSoup   # HTML parsing
import re                       # regular expressions
import os                       # path handling

page = 1     # current list-page number (must live outside the loop, or it resets every pass)
while True:
    url = 'https://www.doutula.com/article/list/?page=%s' % page  # build the list-page URL
    # browser-like headers so the request looks like a normal visit
    headers = {
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    res = requests.get(url=url, headers=headers)     # fetch the list page
    soup = BeautifulSoup(res.text, 'lxml')
    res = soup.prettify()
    # regex out the link of every post on the list page
    cres = re.findall('<a class="list-group-item random_list tg-article" href="(.*?)">', res)
    # if the page has no posts, there are no more pages: stop
    if cres == []:
        print('Scraping finished')
        break

    for i in cres:  # open each post
        res1 = requests.get(url=i, headers=headers)
        soup = BeautifulSoup(res1.text, 'lxml')
        res1 = soup.prettify()
        name1 = soup.h1.text.strip()    # post title, used as the file-name prefix
        # regex out the real URL of every image (carried in the onerror fallback)
        a = """<img alt="" onerror="this.src='(.*?)'"""
        jpgurl = re.findall(a, res1)

        for l in jpgurl:    # loop over the image URLs
            # split the URL to get the file name with its extension
            name2 = l.rsplit('/', 1)[1]
            # build the full save path (post title + file name)
            name = name1 + name2
            add = os.path.join(r'D:\斗图啦', name)
            # fetch the image's binary data
            jpg = requests.get(url=l, headers=headers)
            # write it to disk in chunks (the default chunk size of 1 byte is very slow)
            with open(add, mode='wb') as f:
                for chunk in jpg.iter_content(1024):
                    f.write(chunk)
    page += 1   # move on to the next list page
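
The regexes above run against prettify() output, which is fragile: they silently stop matching if BeautifulSoup ever lays the tags out differently. Since the page is already parsed, the same extraction can be done with BeautifulSoup's own CSS selectors. A minimal sketch; the class names come from the regex above, but the data-original lazy-load attribute is an assumption about how doutula.com served images at the time, so verify it against the live page:

# A sketch of the same extraction with CSS selectors instead of regexes.
# 'data-original' is an assumed lazy-load attribute, not taken from the code above.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
res = requests.get('https://www.doutula.com/article/list/?page=1', headers=headers)
soup = BeautifulSoup(res.text, 'lxml')

# every post link on the list page
links = [a['href'] for a in soup.select('a.list-group-item.random_list.tg-article')]

for link in links:
    detail = BeautifulSoup(requests.get(link, headers=headers).text, 'lxml')
    title = detail.h1.text.strip()                    # post title
    for img in detail.select('img[data-original]'):   # lazy-loaded images
        print(title, img['data-original'])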

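The save step in the main script also assumes that D:\斗图啦 already exists and that the post title is a legal Windows file name. A minimal hardened sketch; the folder, the chunk size, and the illegal-character list are the only assumptions:

# A sketch of a more defensive save step: create the target folder if it is
# missing, strip characters Windows forbids in file names, and stream the
# download instead of holding the whole image in memory.
import os
import re
import requests

def save_image(url, title, folder=r'D:\斗图啦'):
    os.makedirs(folder, exist_ok=True)                  # create the folder on first use
    filename = re.sub(r'[\\/:*?"<>|]', '', title + url.rsplit('/', 1)[1])
    path = os.path.join(folder, filename)
    resp = requests.get(url, stream=True)               # stream the response body
    with open(path, mode='wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)

Calling save_image(l, name1) is a drop-in replacement for the body of the inner for l in jpgurl loop.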