# Copy a website's resources with Python and Selenium, preserving the original styling


import base64
import hashlib
import os
import re
from urllib.parse import urljoin, urlparse

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Suppress InsecureRequestWarning (downloads below use verify=False on purpose)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Directory that will hold the mirrored site
save_dir = 'website_copy'
os.makedirs(save_dir, exist_ok=True)

# Path to the ChromeDriver binary
driver_path = 'chromedriver-win64/chromedriver.exe'

# Initialize Selenium.  Selenium 4 removed the `executable_path` keyword
# argument; the driver location must be wrapped in a Service object.
driver = webdriver.Chrome(service=Service(driver_path))

try:
    # Load the target page so JavaScript-rendered markup is present
    url = 'https://example.com/'
    driver.get(url)

    # Rendered page source, and the final URL (after any redirects) that
    # is used later to resolve relative resource paths
    page_source = driver.page_source
    base_url = driver.current_url
finally:
    # Always release the browser process, even if the page load fails
    driver.quit()


# Download a resource (regular URL or embedded data: URL) into save_dir,
# preserving the site's original directory structure.
def download_file(url, save_dir):
    """Download *url* into *save_dir* and return the local path, or None.

    Two kinds of URL are handled:

    * ``data:`` URLs — the base64 payload is decoded and written to a file
      named after a hash of its content, so multiple embedded resources do
      not overwrite one another.  Returns the saved file's path.
    * http(s) URLs — fetched with retries, saved under ``save_dir``
      mirroring the URL's path.  Returns the save-dir-relative path,
      suitable as a local href/src in the rewritten HTML.

    Returns ``None`` on any failure.
    """
    if url.startswith('data:'):
        # Embedded resource, e.g. "data:image/png;base64,...."
        match = re.match(r'data:(.*?);base64,(.*)', url)
        if match:
            mime_type = match.group(1)
            data = match.group(2)
            # Derive a filename extension from the MIME subtype; replace
            # characters unsafe in filenames (e.g. "svg+xml" -> "svg_xml")
            file_extension = re.sub(r'[^A-Za-z0-9]', '_', mime_type.split('/')[-1])
            # Hash the payload so each distinct embedded resource gets a
            # unique, stable name — the previous fixed name
            # "embedded_image.<ext>" was overwritten by every later data URL.
            digest = hashlib.md5(data.encode('ascii', 'ignore')).hexdigest()[:12]
            file_name = os.path.join(save_dir, f"embedded_{digest}.{file_extension}")
            try:
                with open(file_name, 'wb') as file:
                    file.write(base64.b64decode(data))
                return file_name
            except Exception as e:
                print(f"Error saving embedded data URL: {e}")
    else:
        # Plain http(s) resource: fetch with a retry policy so transient
        # server errors don't lose the file.
        session = requests.Session()
        retry = Retry(total=5, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('https://', adapter)
        session.mount('http://', adapter)

        try:
            # verify=False: mirrored sites sometimes have broken certs (the
            # warning is silenced at module level).  The timeout keeps a
            # hung connection from stalling the whole crawl.
            response = session.get(url, verify=False, timeout=30)
            if response.status_code == 200:
                # Mirror the URL path under save_dir.
                parsed_url = urlparse(url)
                rel_path = parsed_url.path.lstrip('/')
                if not rel_path or rel_path.endswith('/'):
                    # URL has no file component (e.g. "https://host/"):
                    # fall back to a default name instead of trying to
                    # open a directory for writing.
                    rel_path = os.path.join(rel_path, 'index.html')
                file_path = os.path.join(save_dir, rel_path)
                file_dir = os.path.dirname(file_path)
                if file_dir:
                    os.makedirs(file_dir, exist_ok=True)

                with open(file_path, 'wb') as file:
                    file.write(response.content)
                # Return the save-dir-relative path.  The original returned
                # the absolute URL path (leading '/'), which breaks the
                # rewritten links when the copy is opened from disk.
                return rel_path
        except Exception as e:
            print(f"Error downloading {url}: {e}")
    return None


# Parse the rendered HTML so resource references can be rewritten
soup = BeautifulSoup(page_source, 'html.parser')

# Download CSS, JS and image resources and point their tags at the local copies
for tag in soup.find_all(['link', 'script', 'img']):
    if tag.name == 'link':
        # rel is a multi-valued attribute (e.g. rel="preload stylesheet"),
        # so test membership (case-insensitively) rather than equality with
        # ['stylesheet'].  Also guard against a <link> with no href, which
        # would raise KeyError.
        rel_values = [r.lower() for r in (tag.get('rel') or [])]
        if 'stylesheet' in rel_values and tag.get('href'):
            css_url = urljoin(base_url, tag['href'])
            local_css = download_file(css_url, save_dir)
            if local_css:
                tag['href'] = local_css

    elif tag.name == 'script' and tag.get('src'):
        # External JavaScript file
        js_url = urljoin(base_url, tag['src'])
        local_js = download_file(js_url, save_dir)
        if local_js:
            tag['src'] = local_js

    elif tag.name == 'img' and tag.get('src'):
        # Image — may be a regular URL or an embedded data: URL
        img_url = urljoin(base_url, tag['src'])
        local_img = download_file(img_url, save_dir)
        if local_img:
            tag['src'] = local_img

# Write the rewritten HTML alongside the downloaded resources
with open(os.path.join(save_dir, 'index.html'), 'w', encoding='utf-8') as file:
    file.write(str(soup))

print("Website resources have been successfully copied and saved to:", save_dir)


# posted @ 2024-08-16 14:36 by _迷途 — blog footer metadata (scraping artifact, not part of the script)