Python借助Selenium,保留原样式拷贝网站资源
"""Copy a web page and its CSS/JS/image resources into a local directory.

The page is rendered with Selenium (so script-generated markup is captured),
every stylesheet, script and image it references is downloaded under
``save_dir`` using the same directory layout as on the server, and the HTML
is rewritten to point at the local copies before being saved as
``index.html``.
"""
import base64
import hashlib
import os
import posixpath
import re
from urllib.parse import quote, unquote, urljoin, urlparse

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib3.util.retry import Retry

# Downloads below deliberately skip TLS verification (verify=False), so
# silence the warning urllib3 would otherwise print for every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Directory that receives the copied page and its resources.
save_dir = 'website_copy'

# Path to the ChromeDriver executable.
driver_path = 'chromedriver-win64/chromedriver.exe'

# Page to copy.
url = 'https://example.com/'

# Seconds to wait for a resource before giving up; without a timeout a
# stalled server would hang the whole run.
REQUEST_TIMEOUT = 30

# DOTALL so that base64 payloads wrapped over several lines still match.
_DATA_URL_RE = re.compile(r'data:(.*?);base64,(.*)', re.DOTALL)


def _build_session():
    """Return a requests session that retries transient server errors."""
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=0.3,
                  status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    return session


# One shared session: connections are pooled and the retry adapter is built
# once instead of once per downloaded file.
_SESSION = _build_session()


def _save_data_url(url, save_dir):
    """Decode a base64 ``data:`` URL into a file inside ``save_dir``.

    Returns the file name (relative to ``save_dir``), or None when the URL is
    not base64-encoded or cannot be decoded/written.
    """
    match = _DATA_URL_RE.match(url)
    if not match:
        return None
    mime_type, payload = match.groups()
    # "image/svg+xml;charset=utf-8" -> "svg". Only alphanumerics are kept so
    # the page-supplied MIME type can never inject path separators.
    subtype = mime_type.split(';')[0].split('/')[-1].split('+')[0]
    extension = re.sub(r'[^A-Za-z0-9]', '', subtype) or 'bin'
    try:
        content = base64.b64decode(payload)
        # Name the file after its content so distinct embedded resources do
        # not overwrite each other and identical ones are stored once.
        digest = hashlib.sha256(content).hexdigest()[:12]
        file_name = f"embedded_image_{digest}.{extension}"
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, file_name), 'wb') as file:
            file.write(content)
    except (ValueError, OSError) as e:
        print(f"Error saving embedded data URL: {e}")
        return None
    return file_name


def _local_relative_path(url):
    """Map a resource URL to a safe '/'-separated path relative to save_dir.

    Returns None when the URL path does not name a file (empty or ending in
    '/'). Percent-escapes are decoded so the on-disk name matches what a
    browser will look for, and '..' segments are collapsed against the root
    so they cannot climb out of the target directory.
    """
    decoded = unquote(urlparse(url).path)
    if not decoded or decoded.endswith('/'):
        return None
    return posixpath.normpath('/' + decoded).lstrip('/') or None


def download_file(url, save_dir):
    """Download ``url`` into ``save_dir``, preserving the URL's directory layout.

    Args:
        url: Absolute resource URL, or a base64 ``data:`` URL.
        save_dir: Directory under which the resource is stored.

    Returns:
        The saved file's location relative to ``save_dir`` (URL-quoted, with
        '/' separators) so it can be used directly as an ``href``/``src`` in
        an HTML file stored in ``save_dir``; None if nothing was saved.
    """
    if url.startswith('data:'):
        return _save_data_url(url, save_dir)

    relative_path = _local_relative_path(url)
    if relative_path is None:
        print(f"Skipping {url}: URL path has no file name")
        return None

    root = os.path.abspath(save_dir)
    file_path = os.path.abspath(os.path.join(root, *relative_path.split('/')))
    # Final containment check; also catches platform-specific separators
    # (e.g. backslashes or drive letters on Windows) that survive the
    # POSIX-style normalisation above.
    if not file_path.startswith(root + os.sep):
        print(f"Skipping {url}: path escapes the target directory")
        return None

    try:
        # NOTE: TLS verification is intentionally disabled.
        response = _SESSION.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            return None
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        return None
    return quote(relative_path)


def fetch_rendered_page(page_url, chromedriver_path):
    """Load ``page_url`` in Chrome and return ``(page_source, final_url)``.

    ``final_url`` is the browser's URL after any redirects; it is the base
    against which relative resource links must be resolved.
    """
    # Selenium 4 takes the driver location through a Service object; the old
    # ``executable_path`` keyword of ``webdriver.Chrome`` has been removed.
    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path))
    try:
        driver.get(page_url)
        return driver.page_source, driver.current_url
    finally:
        # Always close the browser, even if loading the page failed.
        driver.quit()


def localize_resources(soup, base_url, save_dir):
    """Download the page's CSS/JS/images and point the tags at the local copies."""
    for tag in soup.find_all(['link', 'script', 'img']):
        if tag.name == 'link':
            # ``rel`` is multi-valued (e.g. "stylesheet preload"), so test
            # for membership rather than equality with a one-item list.
            rel_values = [value.lower() for value in (tag.get('rel') or [])]
            if 'stylesheet' not in rel_values:
                continue
            attribute = 'href'
        else:
            attribute = 'src'

        reference = tag.get(attribute)
        if not reference:
            continue

        local_path = download_file(urljoin(base_url, reference), save_dir)
        if local_path:
            tag[attribute] = local_path


def main():
    """Copy the configured page and its resources into ``save_dir``."""
    os.makedirs(save_dir, exist_ok=True)

    page_source, base_url = fetch_rendered_page(url, driver_path)
    soup = BeautifulSoup(page_source, 'html.parser')
    localize_resources(soup, base_url, save_dir)

    with open(os.path.join(save_dir, 'index.html'), 'w', encoding='utf-8') as file:
        file.write(str(soup))

    print("Website resources have been successfully copied and saved to:", save_dir)


if __name__ == '__main__':
    main()
分类:
Python
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
2021-08-16 Python判断当前时间是否在指定时间之间
2021-08-16 Python字符串型强转整型