使用playwright控制浏览器在服务器端将网页转化为PDF文件

问题

在服务端经常需要将网页转换为 PDF 文件。

代码

requirements.txt

点击查看代码
playwright

convert_pdf.py

点击查看代码
from playwright.sync_api import sync_playwright, Playwright
import argparse
from typing import Dict
def run(playwright: Playwright, url: str, path: str, timeout: int, cookie_str: str = None):
    """Open ``url`` in Chromium and save the rendered page as an A4 PDF.

    Args:
        playwright: Playwright driver object, e.g. the value yielded by
            ``sync_playwright()``.
        url: Address of the page to convert.
        path: File path the PDF is written to.
        timeout: Navigation timeout in milliseconds, forwarded to ``page.goto``.
        cookie_str: Optional cookie header style string
            (``"key1=value1; key2=value2"``). When given, every parsed cookie
            is attached to the browser context for the host of ``url`` with
            path ``/`` before the page is loaded.
    """
    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()
        if cookie_str is not None:
            # Cookies are scoped to the host part of the target URL.
            domain = get_address_domain(url)
            cookies = [
                {
                    'name': name,
                    'value': value,
                    'domain': domain,
                    'path': "/",
                }
                for name, value in convert_str_cookie_to_dict(cookie_str).items()
            ]
            # Register all cookies with one call; skip it when nothing parsed.
            if cookies:
                context.add_cookies(cookies)
        page = context.new_page()
        page.goto(url=url, timeout=timeout)
        # Render with the page's print stylesheet rather than the screen one.
        page.emulate_media(media="print")
        page.pdf(
            path=path,
            format="A4",
            outline=True,
            margin=dict(top="35px", right="35px", bottom="35px", left="35px"),
        )
    finally:
        # Always shut the browser down, even when navigation or PDF
        # generation fails, so no browser process is left behind.
        browser.close()
def convert_str_cookie_to_dict(cookie_str: str) -> Dict[str, str]:
    """Parse a ``"key1=value1; key2=value2"`` cookie string into a dict.

    Pairs are separated by ``;``. Each pair is split on its FIRST ``=`` only,
    so values that themselves contain ``=`` (base64 padding, JWTs, ...) are
    kept intact. Surrounding whitespace is removed from names and values.
    Fragments without ``=`` or with an empty name are ignored. If a name
    occurs more than once, the last value wins.

    Args:
        cookie_str: The raw cookie string; ``None`` or ``""`` is allowed.

    Returns:
        Mapping of cookie name to cookie value (empty when nothing parsed).
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for pair in cookie_str.split(";"):
        name, separator, value = pair.strip().partition("=")
        name = name.strip()
        # No "=" at all, or nothing before it: not a usable cookie.
        if not separator or not name:
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
def get_address_domain(url: str) -> str:
    """Return the host name of ``url`` for use as a cookie domain.

    The scheme, user info, port, path, query and fragment are all dropped,
    e.g. ``"http://user:pw@localhost:8080/page?x=1"`` -> ``"localhost"``.
    Input without a scheme (``"www.example.com/path"``) is accepted as well.

    Args:
        url: The address to inspect; may be empty.

    Returns:
        The host name, or ``""`` when ``url`` is empty or no host can be
        determined.
    """
    # Imported locally so this function does not depend on the file header.
    from urllib.parse import urlsplit

    if not url:
        return ""
    candidate = url.strip()
    try:
        parts = urlsplit(candidate)
        if not parts.netloc:
            # Without a "//" marker urlsplit treats the host as part of the
            # path (or scheme); re-parse it as a network location.
            parts = urlsplit("//" + candidate)
        host = parts.hostname
    except ValueError:
        # Raised by urlsplit for malformed authorities (e.g. bad IPv6 literal).
        return ""
    return host or ""
def main() -> None:
    """Command-line entry point: parse the arguments and convert the page.

    Options: ``-u/--url`` (required), ``-p/--path`` (required),
    ``-t/--timeout`` in milliseconds (default 30000, minimum 1000) and
    ``-c/--cookie``. Invalid input terminates the process with a non-zero
    exit status via ``parser.error``.
    """
    parser = argparse.ArgumentParser(description='Convert PDF')
    parser.add_argument('-u', '--url', type=str, required=True,
                        help='Need to convert PDF file network address')
    parser.add_argument('-p', '--path', type=str, required=True, help='save file path')
    parser.add_argument('-t', '--timeout', type=int,
                        help='timeout(Unit millisecond), defualt 30000 ', default=30000)
    parser.add_argument('-c', '--cookie', type=str,
                        help='cookie string,For maintaining logins . eg: key1=value1; key2=value2; key3=value3')
    args = parser.parse_args()
    if args.timeout < 1000:
        # parser.error prints the message to stderr and exits with status 2,
        # so calling scripts can detect the failure.
        parser.error("Please enter the correct timeout period in milliseconds.")
    # Start the Playwright driver only after the arguments are known to be
    # valid, so --help and usage errors never launch it.
    with sync_playwright() as playwright:
        run(playwright, url=args.url, path=args.path, timeout=args.timeout, cookie_str=args.cookie)


if __name__ == "__main__":
    main()
使用以下命令安装依赖

pip install -r requirements.txt

playwright install

等待安装完成后,再使用下列命令转化即可
python .\convert_pdf.py -u https://www.baidu.com --path ./page8.pdf

posted @   east_ebony  阅读(204)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· Deepseek官网太卡,教你白嫖阿里云的Deepseek-R1满血版
· 2分钟学会 DeepSeek API,竟然比官方更好用!
· .NET 使用 DeepSeek R1 开发智能 AI 客户端
· DeepSeek本地性能调优
· 一文掌握DeepSeek本地部署+Page Assist浏览器插件+C#接口调用+局域网访问!全攻略
点击右上角即可分享
微信分享提示