使用playwright控制浏览器在服务器端将网页转化为PDF文件
问题
在服务端经常需要将网页转换为PDF文件
代码
requirements.txt
点击查看代码
playwright
convert_pdf.py
点击查看代码
"""Render a web page to a PDF file with a headless Chromium driven by Playwright.

Usage:
    python convert_pdf.py -u https://example.com -p ./page.pdf [-t 30000] [-c "k=v; k2=v2"]
"""
import argparse
import re
import sys
from typing import TYPE_CHECKING, Dict, Optional
from urllib.parse import urlsplit

if TYPE_CHECKING:
    # Only needed for annotations; the runtime import is deferred to main() so
    # that --help and argument validation work without starting the driver.
    from playwright.sync_api import Playwright

# Matches a leading URL scheme such as "http://" or "https://".
_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.\-]*://")


def run(playwright: "Playwright", url: str, path: str, timeout: int,
        cookie_str: Optional[str] = None) -> None:
    """Open *url* in headless Chromium and save it as an A4 PDF at *path*.

    Args:
        playwright: An active Playwright instance (from ``sync_playwright()``).
        url: Address of the page to render.
        path: Destination file path of the generated PDF.
        timeout: Navigation timeout in milliseconds.
        cookie_str: Optional ``"k1=v1; k2=v2"`` cookie header string; each
            cookie is installed for the host of *url* before navigating.
    """
    browser = playwright.chromium.launch()
    try:
        context = browser.new_context()
        if cookie_str is not None:
            # Cookies are scoped to the host name of the target URL.
            domain = get_address_domain(url)
            cookies = [
                {'name': name, 'value': value, 'domain': domain, 'path': "/"}
                for name, value in convert_str_cookie_to_dict(cookie_str).items()
            ]
            if cookies:
                context.add_cookies(cookies)
        page = context.new_page()
        page.goto(url=url, timeout=timeout)
        # Use the print stylesheet so the PDF matches what "Print" would give.
        page.emulate_media(media="print")
        page.pdf(path=path, format="A4", outline=True,
                 margin=dict(top="35px", right="35px", bottom="35px", left="35px"))
    finally:
        # Always release the browser process, even if navigation or
        # rendering raised.
        browser.close()


def convert_str_cookie_to_dict(cookie_str: str) -> Dict[str, str]:
    """Parse a ``"k1=v1; k2=v2"`` cookie string into a ``{name: value}`` dict.

    Only the first ``=`` separates name from value, so values that themselves
    contain ``=`` (base64 padding, JWTs, ...) are preserved. Segments without
    ``=`` or with an empty name are ignored. An empty or ``None`` input yields
    an empty dict.
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for segment in cookie_str.split(";"):
        name, sep, value = segment.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            continue
        cookie_dict[name] = value
    return cookie_dict


def get_address_domain(url: str) -> str:
    """Return the host name of *url*, without scheme, userinfo, port or path.

    A URL given without a scheme (``"example.com/page"``) is accepted as well.
    Returns ``""`` for an empty URL or when no host can be determined.
    """
    if not url:
        return ""
    # urlsplit only recognises the authority component when it follows "//".
    if not (_SCHEME_RE.match(url) or url.startswith("//")):
        url = "//" + url
    return urlsplit(url).hostname or ""


def main() -> None:
    """Parse the command line and convert the requested page to PDF."""
    parser = argparse.ArgumentParser(description='Convert PDF')
    parser.add_argument('-u', '--url', type=str, required=True,
                        help='Need to convert PDF file network address')
    parser.add_argument('-p', '--path', type=str, required=True,
                        help='save file path')
    parser.add_argument('-t', '--timeout', type=int,
                        help='timeout(Unit millisecond), defualt 30000 ',
                        default=30000)
    parser.add_argument('-c', '--cookie', type=str,
                        help='cookie string,For maintaining logins . '
                             'eg: key1=value1; key2=value2; key3=value3')
    args = parser.parse_args()

    if args.timeout < 1000:
        print("error: Please enter the correct timeout period in milliseconds.",
              file=sys.stderr)
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    # Imported here so invalid arguments are reported before the browser
    # driver is started.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as playwright:
        run(playwright, url=args.url, path=args.path,
            timeout=args.timeout, cookie_str=args.cookie)


if __name__ == "__main__":
    main()
pip install -r requirements.txt
playwright install
等待安装完成后,再使用下列命令转换即可
python .\convert_pdf.py -u https://www.baidu.com --path ./page8.pdf
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Deepseek官网太卡,教你白嫖阿里云的Deepseek-R1满血版
· 2分钟学会 DeepSeek API,竟然比官方更好用!
· .NET 使用 DeepSeek R1 开发智能 AI 客户端
· DeepSeek本地性能调优
· 一文掌握DeepSeek本地部署+Page Assist浏览器插件+C#接口调用+局域网访问!全攻略