使用playwright控制浏览器在服务器端将网页转化为PDF文件

问题

在服务端经常需要将网页转换为 PDF 文件。

代码

requirements.txt

点击查看代码

 playwright

convert_pdf.py

点击查看代码

 from playwright.sync_api import sync_playwright, Playwright
import argparse
from typing import Dict
 
def run(playwright: Playwright, url: str, path: str, timeout: int, cookie_str: str = None):
    """Open ``url`` in Chromium and save the rendered page as an A4 PDF.

    Args:
        playwright: Playwright driver object used to launch Chromium.
        url: Address of the page to convert.
        path: File path the PDF is written to.
        timeout: Navigation timeout in milliseconds, forwarded to ``page.goto``.
        cookie_str: Optional cookie string such as ``"k1=v1; k2=v2"``. When
            given, every parsed cookie is added to the browser context for
            the domain of ``url`` with path ``/`` before the page is loaded.
    """
    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()

        if cookie_str is not None:
            # Cookies are scoped to the domain part of the target address.
            domain = get_address_domain(url)
            cookies = [
                {
                    'name': name,
                    'value': value,
                    'domain': domain,
                    'path': "/",
                }
                for name, value in convert_str_cookie_to_dict(cookie_str).items()
            ]
            # One batched call instead of one round trip per cookie.
            if cookies:
                context.add_cookies(cookies)

        page = context.new_page()
        page.goto(url=url, timeout=timeout)
        # Render with the page's print stylesheet rather than the screen one.
        page.emulate_media(media="print")
        page.pdf(
            path=path,
            format="A4",
            outline=True,
            margin=dict(top="35px", right="35px", bottom="35px", left="35px"),
        )
    finally:
        # Always shut the browser down, even when navigation or PDF
        # generation raises, so no browser process is left behind.
        browser.close()
 
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
    """Parse a ``"k1=v1; k2=v2"`` cookie string into a ``{name: value}`` dict.

    Each ``;``-separated item is split at its FIRST ``=`` only, so values
    that themselves contain ``=`` (for example base64 padding) are kept
    intact. Items without ``=`` or with an empty name are skipped.
    Surrounding whitespace is removed from names and values.

    Args:
        cookie_str: Raw cookie string; ``None`` or ``""`` yields an empty dict.

    Returns:
        Mapping of cookie name to cookie value. If a name occurs more than
        once, the last occurrence wins.
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for item in cookie_str.split(";"):
        name, separator, value = item.strip().partition("=")
        name = name.strip()
        if not separator or not name:
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
 
def get_address_domain(url: str) -> str:
    """Return the host name of ``url`` without scheme, port, userinfo or path.

    Addresses written without a scheme (``"example.com/page"``) are accepted
    as well. The host name is returned in lower case.

    Args:
        url: Address to inspect; ``None`` or ``""`` yields ``""``.

    Returns:
        The host name, or ``""`` when none can be determined.

    Raises:
        ValueError: If ``urlsplit`` rejects the address (for example an
            unbalanced ``[`` in an IPv6 host).
    """
    from urllib.parse import urlsplit

    if not url:
        return ""
    parts = urlsplit(url)
    if not parts.netloc:
        # urlsplit only recognises a network location after "//"; add it so
        # scheme-less input such as "example.com:8080/x" is parsed as a host.
        parts = urlsplit("//" + url)
    return parts.hostname or ""
 
def main() -> None:
    """Parse the command-line options and convert the requested page to PDF.

    The arguments are validated before Playwright is started, so a usage
    error does not spin up the browser driver. An invalid timeout is
    reported through ``parser.error``, which prints the message to stderr
    and exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(description='Convert PDF')
    parser.add_argument('-u', '--url', type=str, required=True, help='Need to convert PDF file network address')
    parser.add_argument('-p', '--path', type=str, required=True, help='save file path')
    parser.add_argument('-t', '--timeout', type=int, help='timeout(Unit millisecond), defualt 30000 ', default=30000)
    parser.add_argument('-c', '--cookie', type=str, help='cookie string，For maintaining logins . eg: key1=value1; key2=value2; key3=value3')
    args = parser.parse_args()

    # Values below one second are treated as a mistake (e.g. seconds given
    # instead of milliseconds).
    if args.timeout < 1000:
        parser.error("Please enter the correct timeout period in milliseconds.")

    with sync_playwright() as playwright:
        run(playwright, url=args.url, path=args.path, timeout=args.timeout, cookie_str=args.cookie)


if __name__ == "__main__":
    main()

使用以下命令安装依赖

pip install -r requirements.txt

playwright install

等待安装完成后，再使用下列命令转化即可
python .\convert_pdf.py -u https://www.baidu.com --path ./page8.pdf

posted @ 2024-05-15 17:55 east_ebony 阅读(204) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

公告

昵称： east_ebony
园龄： 6年10个月
粉丝： 0
关注： 24

2025年2月

日

一

二

三

四

五

六

anebony

使用playwright控制浏览器在服务器端将网页转化为PDF文件

问题

代码

公告

搜索

常用链接

最新随笔

我的标签

随笔分类

随笔档案

阅读排行榜

	from playwright.sync_api import sync_playwright, Playwright
	import argparse
	from typing import Dict

	def run(playwright: Playwright, url: str, path: str, timeout: int, cookie_str: str = None):
	    """Open ``url`` in Chromium and save the rendered page as an A4 PDF.

	    Args:
	        playwright: Playwright driver object used to launch Chromium.
	        url: Address of the page to convert.
	        path: File path the PDF is written to.
	        timeout: Navigation timeout in milliseconds, forwarded to ``page.goto``.
	        cookie_str: Optional cookie string such as ``"k1=v1; k2=v2"``. When
	            given, every parsed cookie is added to the browser context for
	            the domain of ``url`` with path ``/`` before the page is loaded.
	    """
	    browser = playwright.chromium.launch()
	    try:
	        context = browser.new_context()

	        if cookie_str is not None:
	            # Cookies are scoped to the domain part of the target address.
	            domain = get_address_domain(url)
	            cookies = [
	                {
	                    'name': name,
	                    'value': value,
	                    'domain': domain,
	                    'path': "/",
	                }
	                for name, value in convert_str_cookie_to_dict(cookie_str).items()
	            ]
	            # One batched call instead of one round trip per cookie.
	            if cookies:
	                context.add_cookies(cookies)

	        page = context.new_page()
	        page.goto(url=url, timeout=timeout)
	        # Render with the page's print stylesheet rather than the screen one.
	        page.emulate_media(media="print")
	        page.pdf(
	            path=path,
	            format="A4",
	            outline=True,
	            margin=dict(top="35px", right="35px", bottom="35px", left="35px"),
	        )
	    finally:
	        # Always shut the browser down, even when navigation or PDF
	        # generation raises, so no browser process is left behind.
	        browser.close()

	def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
	    """Parse a ``"k1=v1; k2=v2"`` cookie string into a ``{name: value}`` dict.

	    Each ``;``-separated item is split at its FIRST ``=`` only, so values
	    that themselves contain ``=`` (for example base64 padding) are kept
	    intact. Items without ``=`` or with an empty name are skipped.
	    Surrounding whitespace is removed from names and values.

	    Args:
	        cookie_str: Raw cookie string; ``None`` or ``""`` yields an empty dict.

	    Returns:
	        Mapping of cookie name to cookie value. If a name occurs more than
	        once, the last occurrence wins.
	    """
	    cookie_dict: Dict[str, str] = {}
	    if not cookie_str:
	        return cookie_dict
	    for item in cookie_str.split(";"):
	        name, separator, value = item.strip().partition("=")
	        name = name.strip()
	        if not separator or not name:
	            continue
	        cookie_dict[name] = value.strip()
	    return cookie_dict

	def get_address_domain(url: str) -> str:
	    """Return the host name of ``url`` without scheme, port, userinfo or path.

	    Addresses written without a scheme (``"example.com/page"``) are accepted
	    as well. The host name is returned in lower case.

	    Args:
	        url: Address to inspect; ``None`` or ``""`` yields ``""``.

	    Returns:
	        The host name, or ``""`` when none can be determined.

	    Raises:
	        ValueError: If ``urlsplit`` rejects the address (for example an
	            unbalanced ``[`` in an IPv6 host).
	    """
	    from urllib.parse import urlsplit

	    if not url:
	        return ""
	    parts = urlsplit(url)
	    if not parts.netloc:
	        # urlsplit only recognises a network location after "//"; add it so
	        # scheme-less input such as "example.com:8080/x" is parsed as a host.
	        parts = urlsplit("//" + url)
	    return parts.hostname or ""

	def main() -> None:
	    """Parse the command-line options and convert the requested page to PDF.

	    The arguments are validated before Playwright is started, so a usage
	    error does not spin up the browser driver. An invalid timeout is
	    reported through ``parser.error``, which prints the message to stderr
	    and exits with a non-zero status.
	    """
	    parser = argparse.ArgumentParser(description='Convert PDF')
	    parser.add_argument('-u', '--url', type=str, required=True, help='Need to convert PDF file network address')
	    parser.add_argument('-p', '--path', type=str, required=True, help='save file path')
	    parser.add_argument('-t', '--timeout', type=int, help='timeout(Unit millisecond), defualt 30000 ', default=30000)
	    parser.add_argument('-c', '--cookie', type=str, help='cookie string，For maintaining logins . eg: key1=value1; key2=value2; key3=value3')
	    args = parser.parse_args()

	    # Values below one second are treated as a mistake (e.g. seconds given
	    # instead of milliseconds).
	    if args.timeout < 1000:
	        parser.error("Please enter the correct timeout period in milliseconds.")

	    with sync_playwright() as playwright:
	        run(playwright, url=args.url, path=args.path, timeout=args.timeout, cookie_str=args.cookie)


	if __name__ == "__main__":
	    main()