Saving multiple HTML pages as PDF in Python with selenium and PyPDF2

While looking for reference material I came across a fairly complete documentation site and wanted to save all of its pages. It was a good opportunity to use the requests and BeautifulSoup libraries to fetch and parse all of the static pages, save each one as a separate PDF file, and then merge all of the PDFs into a single document.

My first thought was to call the wkhtmltopdf tool from a Python subprocess to render each static page as a PDF, but every PDF produced that way would inevitably include the page's navigation bar, table of contents, and other unwanted elements. I then noticed that the pages have a "download as PDF" button, and the file it produces contains only the content of the current page, which is exactly what I wanted. Doing that by hand for every page would still be a huge amount of work, though, to say nothing of keeping the merged PDF in the right order...
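
For reference, a minimal sketch of the rejected subprocess approach might look like this (the URL and output filename are placeholders, and it assumes wkhtmltopdf is installed and on the PATH):

import subprocess

# Hypothetical example of the rejected approach: render one page with wkhtmltopdf.
# The URL and output name are placeholders, not the actual documentation site.
subprocess.run(
    ["wkhtmltopdf", "--quiet", "https://example.com/page.html", "page.pdf"],
    check=True,  # raise CalledProcessError if wkhtmltopdf fails
)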

So the only workable approach was to find a way to simulate the user's browser workflow: print the current page and save it as a PDF.

Further searching showed that this can be done with the selenium browser-automation framework.

Saving an HTML page as a PDF

import requests, re, os, random
import json, time
from selenium import webdriver

pdf_dir = "dir"

# print a single page to PDF and save it under its title
def downloadPdf(params):
    time.sleep(random.randint(1, 5))  # random delay of a few seconds before starting

    title, url = params
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--enable-print-browser")
    chrome_options.add_argument("--kiosk-printing")
    settings = {
        "recentDestinations": [{"id": "Save as PDF", "origin": "local"}],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isCssBackgroundEnabled": False,
        "mediaSize": {"height_microns": 297000, "width_microns": 210000, "name": "ISO_A4", "custom_display_name": "A4"},
    }
    prefs = {
        "printing.print_preview_sticky_settings.appState": json.dumps(settings),
        "savefile.default_directory": pdf_dir,  # 下载文件保存的路径
    }
    chrome_options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    driver.execute_script("document.title='" + title + "';window.print();")
    time.sleep(3)  # give Chrome a few seconds to finish writing the PDF
    driver.quit()
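
A quick way to sanity-check the function is to print a single page; the title and URL below are placeholders:

# Hypothetical smoke test; replace the title and URL with real values.
downloadPdf(("sample-page", "https://example.com/sample-page.html"))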

Getting the table-of-contents URLs and saving each page as a separate PDF

Fetch the index page and parse it to collect the URLs listed in the table of contents.

import requests, re
from bs4 import BeautifulSoup

main_url = "url"


def getUrls():
    titles = []
    urls = []
    titles_exc = []  # section headings that will get bookmarks only when merging

    url_begin = main_url + "index.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    }
    response = requests.get(url_begin, headers=headers)
    if response.status_code == 200:
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.content, "lxml")

        # collect the level-1 and level-2 TOC titles; their pages are excluded from the merge
        elems_l1 = soup.find_all("li", class_=re.compile("toctree-l1"))
        for elem in elems_l1:
            title_elem = elem.find("a", class_=re.compile("reference internal"))
            title = re.sub(r"\s+", "", title_elem.get_text().strip())
            titles_exc.append(title)
        elems_l2 = soup.find_all("li", class_=re.compile("toctree-l2"))
        for elem in elems_l2:
            title_elem = elem.find("a", class_=re.compile("reference internal"))
            title = re.sub(r"\s+", "", title_elem.get_text().strip())
            titles_exc.append(title)

        elems = soup.find_all("a", class_=re.compile("reference internal"))
        for elem in elems:
            title = re.sub(r"\s+", "", elem.get_text().strip())
            if elem.attrs["href"] == "#":
                url = url_begin
            elif elem.attrs["href"].startswith("#"):
                continue
            else:
                url = main_url + elem.attrs["href"]
            urls.append(url)
            titles.append(title)

    return titles, urls, titles_exc
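
For context, the class names targeted above ("toctree-l1", "toctree-l2", "reference internal") are the markup that Sphinx-generated documentation uses for its table of contents. A small made-up fragment run through the same kind of query shows the structure the parser expects:

import re
from bs4 import BeautifulSoup

# Made-up fragment mimicking a Sphinx index page; not taken from the real site.
html = """
<li class="toctree-l1">
  <a class="reference internal" href="intro.html">Introduction</a>
  <ul><li class="toctree-l2">
    <a class="reference internal" href="intro.html#setup">Setup</a>
  </li></ul>
</li>
"""
soup = BeautifulSoup(html, "lxml")
for a in soup.find_all("a", class_=re.compile("reference internal")):
    print(a.get_text().strip(), a.attrs["href"])
# Introduction intro.html
# Setup intro.html#setup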

Save every URL parsed above as a separate PDF document.

import concurrent.futures

titles, urls, titles_exc = getUrls()

# thread pool: print pages two at a time
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # consuming the map iterator waits for every task and surfaces any worker exception
    list(executor.map(downloadPdf, zip(titles, urls)))

Merging the individual PDFs into a single PDF file

import os
import PyPDF2


def list_files(dir, titles):
    # keep only titles whose PDF actually exists so titles and file paths stay aligned
    kept_titles = []
    filelists = []
    for title in titles:
        file = os.path.join(dir, title + ".pdf")
        if os.path.exists(file):
            kept_titles.append(title)
            filelists.append(file)
    return kept_titles, filelists


# merge the individual PDFs and add a bookmark for each title
def merge_pdfs(titles, paths, titles_exc, output):
    pdf_writer = PyPDF2.PdfWriter()

    page_num = 0
    for idx, path in enumerate(paths):
        pdf_reader = PyPDF2.PdfReader(path)
        title = titles[idx]

        if title not in titles_exc:
            for page in range(len(pdf_reader.pages)):
                pdf_writer.add_page(pdf_reader.pages[page])

            # bookmark pointing at the first page of this section
            pdf_writer.add_outline_item(title, page_num)

            page_num += len(pdf_reader.pages)
        else:
            # excluded section heading: add a bookmark only, without merging its pages
            pdf_writer.add_outline_item(title, page_num)

    with open(output, "wb") as out:
        pdf_writer.write(out)
        
        
# files in document order (titles whose PDF is missing are dropped)
merged_titles, file_list = list_files(pdf_dir, titles)

# merge the PDFs
merge_pdfs(merged_titles, file_list, titles_exc, output=os.path.join(pdf_dir, "output.pdf"))

Complete code

import requests, re, os, random
from bs4 import BeautifulSoup
import json, time
from selenium import webdriver
import PyPDF2
import concurrent.futures

pdf_dir = "dir"
main_url = "url"


def getUrls():
    titles = []
    urls = []
    titles_exc = []  # section headings that will get bookmarks only when merging

    url_begin = main_url + "index.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    }
    response = requests.get(url_begin, headers=headers)
    if response.status_code == 200:
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.content, "lxml")

        # collect the level-1 and level-2 TOC titles; their pages are excluded from the merge
        elems_l1 = soup.find_all("li", class_=re.compile("toctree-l1"))
        for elem in elems_l1:
            title_elem = elem.find("a", class_=re.compile("reference internal"))
            title = re.sub(r"\s+", "", title_elem.get_text().strip())
            titles_exc.append(title)
        elems_l2 = soup.find_all("li", class_=re.compile("toctree-l2"))
        for elem in elems_l2:
            title_elem = elem.find("a", class_=re.compile("reference internal"))
            title = re.sub(r"\s+", "", title_elem.get_text().strip())
            titles_exc.append(title)

        elems = soup.find_all("a", class_=re.compile("reference internal"))
        for elem in elems:
            title = re.sub(r"\s+", "", elem.get_text().strip())
            if elem.attrs["href"] == "#":
                url = url_begin
            elif elem.attrs["href"].startswith("#"):
                continue
            else:
                url = main_url + elem.attrs["href"]
            urls.append(url)
            titles.append(title)

    return titles, urls, titles_exc


# print a single page to PDF and save it under its title
def downloadPdf(params):
    time.sleep(random.randint(1, 5))  # random delay of a few seconds before starting

    title, url = params
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--enable-print-browser")
    chrome_options.add_argument("--kiosk-printing")
    settings = {
        "recentDestinations": [{"id": "Save as PDF", "origin": "local"}],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
        "isHeaderFooterEnabled": False,
        "isCssBackgroundEnabled": False,
        "mediaSize": {"height_microns": 297000, "width_microns": 210000, "name": "ISO_A4", "custom_display_name": "A4"},
    }
    prefs = {
        "printing.print_preview_sticky_settings.appState": json.dumps(settings),
        "savefile.default_directory": pdf_dir,  # 下载文件保存的路径
    }
    chrome_options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    driver.execute_script("document.title='" + title + "';window.print();")
    time.sleep(3)  # give Chrome a few seconds to finish writing the PDF
    driver.quit()


def list_files(dir, titles):
    # keep only titles whose PDF actually exists so titles and file paths stay aligned
    kept_titles = []
    filelists = []
    for title in titles:
        file = os.path.join(dir, title + ".pdf")
        if os.path.exists(file):
            kept_titles.append(title)
            filelists.append(file)
    return kept_titles, filelists


# merge the individual PDFs and add a bookmark for each title
def merge_pdfs(titles, paths, titles_exc, output):
    pdf_writer = PyPDF2.PdfWriter()

    page_num = 0
    for idx, path in enumerate(paths):
        pdf_reader = PyPDF2.PdfReader(path)
        title = titles[idx]

        if title not in titles_exc:
            for page in range(len(pdf_reader.pages)):
                pdf_writer.add_page(pdf_reader.pages[page])

            # bookmark pointing at the first page of this section
            pdf_writer.add_outline_item(title, page_num)

            page_num += len(pdf_reader.pages)
        else:
            # excluded section heading: add a bookmark only, without merging its pages
            pdf_writer.add_outline_item(title, page_num)

    with open(output, "wb") as out:
        pdf_writer.write(out)


titles, urls, titles_exc = getUrls()

# thread pool: print pages two at a time
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # consuming the map iterator waits for every task and surfaces any worker exception
    list(executor.map(downloadPdf, zip(titles, urls)))

# files in document order (titles whose PDF is missing are dropped)
merged_titles, file_list = list_files(pdf_dir, titles)

# merge the PDFs
merge_pdfs(merged_titles, file_list, titles_exc, output=os.path.join(pdf_dir, "output.pdf"))

 
