Python借助Selenium,保留原样式拷贝网站资源

import base64
import hashlib
import os
import re
from urllib.parse import urljoin, urlparse

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# Silence the InsecureRequestWarning that urllib3 emits for requests made
# with certificate verification turned off (see download_file).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Directory that receives the copied site.
save_dir = 'website_copy'
os.makedirs(save_dir, exist_ok=True)

# Location of the ChromeDriver executable.
driver_path = 'chromedriver-win64/chromedriver.exe'

# Site to copy.
url = 'https://example.com/'

# Selenium 4 takes the driver location through a Service object; the
# `executable_path` keyword of webdriver.Chrome is no longer accepted by
# current Selenium releases.
driver = webdriver.Chrome(service=Service(executable_path=driver_path))
try:
    driver.get(url)
    # Source of the page as rendered by the browser.
    page_source = driver.page_source
    # Final URL after any redirects; used to resolve relative resource links.
    base_url = driver.current_url
finally:
    # Always shut the browser down, even when loading the page fails.
    driver.quit()
# Download helpers: handle base64 data URLs as well as HTTP(S) resources and
# mirror the remote directory layout underneath the target directory.
def _save_data_url(url, save_dir):
    """Decode a base64 ``data:`` URL into *save_dir*; return the file name or None."""
    match = re.match(r'data:(.*?);base64,(.*)', url, re.DOTALL)
    if not match:
        # Data URLs that are not base64-encoded stay embedded in the page.
        return None
    # Drop MIME parameters (";charset=utf-8") and structured-syntax suffixes
    # ("+xml") so that "image/svg+xml" yields the extension "svg".
    mime_type = match.group(1).split(';')[0].strip()
    subtype = mime_type.split('/')[-1].split('+')[0]
    file_extension = re.sub(r'[^A-Za-z0-9_-]', '', subtype) or 'bin'
    try:
        # Decode before opening the file so a bad payload leaves nothing behind.
        payload = base64.b64decode(match.group(2))
        # Name the file after its content: distinct payloads no longer
        # overwrite each other, identical payloads share one file.
        digest = hashlib.sha256(payload).hexdigest()[:16]
        file_name = f"embedded_{digest}.{file_extension}"
        os.makedirs(save_dir or '.', exist_ok=True)
        with open(os.path.join(save_dir, file_name), 'wb') as file:
            file.write(payload)
        return file_name
    except (ValueError, OSError) as e:
        print(f"Error saving embedded data URL: {e}")
        return None


def _relative_path_for(url, save_dir):
    """Return the URL path relative to *save_dir*, or None if it is unusable."""
    relative_path = urlparse(url).path.lstrip('/')
    if not relative_path or relative_path.endswith('/'):
        # Directory-like URLs carry no file name to save under.
        return None
    root = os.path.abspath(save_dir)
    target = os.path.abspath(os.path.join(root, *relative_path.split('/')))
    try:
        inside = os.path.commonpath([root, target]) == root
    except ValueError:
        # Raised e.g. for paths on different drives; treat as outside.
        inside = False
    return relative_path if inside else None


def download_file(url, save_dir, timeout=30):
    """Save the resource at *url* under *save_dir*, keeping its directory layout.

    Args:
        url: An absolute HTTP(S) URL or a base64 ``data:`` URL.
        save_dir: Directory the resource is stored in.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file relative to *save_dir*, using forward
        slashes, so it can be used directly as an ``href``/``src`` value in an
        HTML file stored in *save_dir*. Returns None when nothing was saved
        (non-base64 data URL, non-200 response, unusable path, or an error).
    """
    if url.startswith('data:'):
        return _save_data_url(url, save_dir)

    # Work out the local path first so nothing is fetched for URLs that
    # cannot be stored inside save_dir (e.g. paths containing "..").
    relative_path = _relative_path_for(url, save_dir)
    if relative_path is None:
        print(f"Skipping {url}: no file path inside {save_dir} can be derived")
        return None

    # Retry transient server errors with a short exponential backoff.
    retry = Retry(total=5, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    try:
        with requests.Session() as session:
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            # NOTE(review): certificate verification is disabled on purpose
            # here; downloaded content is therefore not authenticated.
            response = session.get(url, verify=False, timeout=timeout)
            if response.status_code != 200:
                print(f"Skipping {url}: HTTP {response.status_code}")
                return None
            file_path = os.path.join(save_dir, *relative_path.split('/'))
            os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            return relative_path
    except Exception as e:
        # Best-effort copy: one failing resource must not abort the whole run.
        print(f"Error downloading {url}: {e}")
        return None
# Parse the rendered HTML.
soup = BeautifulSoup(page_source, 'html.parser')

# Attribute holding the resource URL for each kind of tag that is localized.
url_attributes = {'link': 'href', 'script': 'src', 'img': 'src'}
# Results per absolute URL, so a resource referenced several times is
# requested only once (failed downloads are remembered as None as well).
downloaded = {}

for tag in soup.find_all(list(url_attributes)):
    attribute = url_attributes[tag.name]
    reference = tag.get(attribute)
    if not reference:
        # Inline scripts and tags without a URL have nothing to download.
        continue
    if tag.name == 'link':
        # rel may hold several tokens (e.g. "stylesheet preload"); only
        # links that include the stylesheet token are copied.
        rel = tag.get('rel') or []
        if isinstance(rel, str):
            rel = rel.split()
        if 'stylesheet' not in [token.lower() for token in rel]:
            continue
    resource_url = urljoin(base_url, reference)
    if resource_url not in downloaded:
        downloaded[resource_url] = download_file(resource_url, save_dir)
    local_path = downloaded[resource_url]
    if local_path:
        # Point the tag at the local copy; leave it untouched on failure.
        tag[attribute] = local_path

# Write the rewritten HTML next to the downloaded resources.
with open(os.path.join(save_dir, 'index.html'), 'w', encoding='utf-8') as file:
    file.write(str(soup))
print("Website resources have been successfully copied and saved to:", save_dir)
posted @   _迷途  阅读(22)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
历史上的今天:
2021-08-16 Python判断当前时间是否在指定时间之间
2021-08-16 Python字符串型强转整型
点击右上角即可分享
微信分享提示