python 单线程图片下载
# Single-threaded image downloader built on urllib.
#
# Flow: main() walks the listing pages in the requested range, get_images_url()
# extracts the gallery links found on each listing page, and
# image_category_response() walks the numbered pages of every gallery and hands
# each picture URL to download_images().
import urllib.request
import urllib.parse
import urllib.error
import re
import os
import ssl

# NOTE(review): this disables HTTPS certificate verification for every urllib
# request made by this process, which allows man-in-the-middle attacks. It is
# kept because the script has always run this way; remove it if the target
# host serves a valid certificate.
ssl._create_default_https_context = ssl._create_unverified_context

# Root folder that receives one sub-folder per listing page.
path = "./images"

# Headers sent with every request (listing pages, gallery pages and images).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "referer": "https://www.mzitu.com/xinggan/"
}

# Patterns are compiled once at import time instead of once per page.
# They are deliberately compiled WITHOUT re.S: the original code passed re.S
# as the second argument of Pattern.findall(), where it is interpreted as the
# start position (16), not as a flag, so DOTALL was never actually in effect.
# Turning it on now would let the greedy ".*" in the name pattern run across
# lines and change which text is captured.
_CATEGORY_NAME_RE = re.compile(r'<li>.*?<img .* alt=(.*?) .*? />.*?')
_CATEGORY_HREF_RE = re.compile(r'<li><a href=(.*?) .*?>.*?')
_MAIN_IMAGE_RE = re.compile(r'<div class="main-image"><p>.*?<img src=(.*?) .*? />.*?')


def handler_request(url, pageIndex):
    """Build the request for one listing page.

    Args:
        url: Base URL; the page number is appended to it verbatim.
        pageIndex: Page number to request.

    Returns:
        A ``urllib.request.Request`` for ``url + str(pageIndex)`` carrying the
        module-level ``headers``.
    """
    url = url + str(pageIndex)
    # Build the request object.
    return urllib.request.Request(url=url, headers=headers)


def get_images_url(content, basePath):
    """Extract every gallery link from a listing page and download each one.

    Args:
        content: Decoded HTML of a listing page.
        basePath: Folder under which one sub-folder per gallery is created.
    """
    # Search the whole document. The original passed re.S here, which
    # findall() treated as pos=16 and therefore skipped the first 16
    # characters of the page.
    alts = _CATEGORY_NAME_RE.findall(content)
    hrefs = _CATEGORY_HREF_RE.findall(content)

    # zip() stops at the shorter list, so a page on which the two patterns
    # match a different number of times no longer raises IndexError.
    # The first and last character of each captured name are dropped
    # (the surrounding quotes in the expected markup).
    image_map = {}
    for alt, href in zip(alts, hrefs):
        image_map[alt[1:-1]] = href

    for item in image_map.items():
        image_category_response(item, basePath)


def image_category_response(item, basePath):
    """Download the pictures of one gallery, page by page.

    Args:
        item: ``(name, href)`` pair; ``href`` still carries one leading and one
            trailing character (the quotes captured by the regex), which are
            stripped here.
        basePath: Folder in which the gallery's own folder is created.

    Any error other than ``URLError`` is printed and ends this gallery only,
    so one broken gallery does not stop the whole run.
    """
    alt = item[0]
    save_folder = os.path.join(basePath, alt)
    os.makedirs(save_folder, exist_ok=True)

    baseurl = item[1][1:-1]
    # Upper bound on pages tried; the loop normally ends earlier, on the
    # first URLError.
    pageCount = 1000
    try:
        # NOTE(review): numbering starts at 0, so the first URL requested is
        # "<baseurl>/0" -- confirm this is what the site expects.
        for pageIndex in range(pageCount):
            page_url = baseurl + "/" + str(pageIndex)
            try:
                # Build the request object.
                request = urllib.request.Request(url=page_url, headers=headers)
                # Send the request; the response is closed when the block ends.
                with urllib.request.urlopen(request) as response:
                    content = response.read().decode()
                imgUrl = _MAIN_IMAGE_RE.findall(content)
                download_images(imgUrl[0], save_folder)
            except urllib.error.URLError:
                # A failed request (page or image) is taken as the end of the
                # gallery. Same message as before, without raising a
                # TypeError just to leave the loop.
                print("最大页面数{0}".format(pageIndex - 1))
                break
    except Exception as e:
        # Best-effort: report the problem (e.g. no image found on the page)
        # and give up on this gallery only.
        print(e)


def download_images(url, save_path):
    """Download one image into ``save_path``.

    Args:
        url: Image URL still wrapped in one leading and one trailing character
            (the quotes captured by the regex); they are stripped here.
        save_path: Existing folder that receives the file. The file name is
            the last ``/``-separated segment of the URL.

    Raises:
        urllib.error.URLError: If the image cannot be fetched.
    """
    url = url[1:-1]
    print(url)
    # Build the request object.
    request = urllib.request.Request(url=url, headers=headers)
    filename = url.split('/')[-1]
    # Open the connection first so a failed request leaves no empty file.
    with urllib.request.urlopen(request) as response:
        with open(os.path.join(save_path, filename), 'wb') as fb:
            fb.write(response.read())


def parse_pages(content):
    """Print ``content`` (debug helper)."""
    print(content)


def main():
    """Prompt for a page range and download every listing page in it."""
    url = 'https://www.mzitu.com/xinggan/page/'
    start_page = int(input("请输入起始页码:"))
    end_page = int(input("请输入结束页码:"))

    # Create the root folder.
    os.makedirs(path, exist_ok=True)

    for pageIndex in range(start_page, end_page + 1):
        print("...........开始下载第{0}页".format(pageIndex))
        # Create the folder for this listing page.
        save_path = create_folder(pageIndex)
        # Build the request.
        request = handler_request(url, pageIndex)
        # Send the request and read the response body.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        # Parse the page, extract the images and download them.
        get_images_url(content, save_path)
        print("...........结束下载第{0}页".format(pageIndex))


def create_folder(pageIndex):
    """Create (if needed) and return the folder for one listing page.

    Args:
        pageIndex: Page number, used as the folder name under ``path``.

    Returns:
        The folder path with backslashes replaced by ``/`` and a trailing
        ``/`` appended.
    """
    save_path = os.path.join(path, str(pageIndex))
    # makedirs also creates the root folder when it is missing and does not
    # fail if the folder already exists.
    os.makedirs(save_path, exist_ok=True)
    return save_path.replace("\\", "/") + "/"


if __name__ == "__main__":
    main()
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 从二进制到误差:逐行拆解C语言浮点运算中的4008175468544之谜
· .NET制作智能桌面机器人:结合BotSharp智能体框架开发语音交互
· 软件产品开发中常见的10个问题及处理方法
· .NET 原生驾驭 AI 新基建实战系列:向量数据库的应用与畅想
· 从问题排查到源码分析:ActiveMQ消费端频繁日志刷屏的秘密
· Windows桌面应用自动更新解决方案SharpUpdater5发布
· 我的家庭实验室服务器集群硬件清单
· C# 13 中的新增功能实操
· Supergateway:MCP服务器的远程调试与集成工具
· Vue3封装支持Base64导出的电子签名组件