Converting HTML to PDF with Python

1. Install wkhtmltopdf

On Windows, download the stable wkhtmltopdf installer from http://wkhtmltopdf.org/downloads.html and run it. After installation, add the program's executable directory to the system PATH environment variable; otherwise pdfkit cannot locate wkhtmltopdf and fails with the error "No wkhtmltopdf executable found".
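Alternatively, if you prefer not to modify PATH, pdfkit can be pointed at the binary explicitly through a configuration object. A minimal sketch, assuming the default Windows install location (the path is an assumption; adjust it to wherever wkhtmltopdf actually landed):

import pdfkit

# Point pdfkit at the wkhtmltopdf binary directly instead of relying on PATH.
# The path below is the default Windows install location -- an assumption; adjust as needed.
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_url('http://google.com', 'out.pdf', configuration=config)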

2. Install pdfkit

Simply run pip install pdfkit

pdfkit is a Python wrapper around wkhtmltopdf.

import pdfkit

# There are three ways to generate a PDF:
pdfkit.from_url('http://google.com', 'out.pdf')
pdfkit.from_file('test.html', 'out.pdf')
pdfkit.from_string('Hello!', 'out.pdf')
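Note that from_url and from_file also accept a list of inputs, whose pages are rendered into a single output PDF in order; the complete example in section 4 relies on from_file. The file names below are placeholders:

import pdfkit

# Multiple inputs are concatenated into one PDF, in the order given.
pdfkit.from_file(['chapter1.html', 'chapter2.html'], 'book.pdf')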

3. Merge PDFs with PyPDF2

Simply run pip install PyPDF2

from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()
input1 = open("1.pdf", "rb")
input2 = open("2.pdf", "rb")
merger.append(input1)
merger.append(input2)
# Write the merged result to the output PDF
output = open("hql_all.pdf", "wb")
merger.write(output)
merger.close()  # releases the input and output file handles
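append() also accepts a file path directly, plus optional arguments such as a bookmark title and a page range, which helps when stitching chapters together. A minimal sketch against the classic PyPDF2 API (file names are placeholders):

from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()
# Bookmark each part; take only the first 10 pages of the second file.
merger.append("part1.pdf", bookmark="Part 1")
merger.append("part2.pdf", bookmark="Part 2", pages=(0, 10))
merger.write("merged.pdf")
merger.close()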

4. Complete example:

# coding=utf-8
import os
import re
import time
import logging
import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""


def parse_url_to_html(url, name):
    """
    Parse a URL and save its main content as an HTML file.
    :param url: the URL to parse
    :param name: the HTML file name to save to
    :return: the saved file name
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Main content
        body = soup.find_all(class_="x-wiki-content")[0]
        # Title
        title = soup.find('h4').get_text()

        # Prepend the title to the content, centered
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative src paths in <img> tags to absolute URLs
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            # group(2) is the src value; prefix it only when it is relative
            if not m.group(2).startswith("http"):
                rtn = m.group(1) + "http://www.liaoxuefeng.com" + m.group(2) + m.group(3)
                return rtn
            else:
                return m.group(1) + m.group(2) + m.group(3)
        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name

    except Exception:
        logging.error("Parse error", exc_info=True)


def get_url_list():
    """
    Get the list of all chapter URLs from the table of contents.
    :return: list of URLs
    """
    response = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        url = "http://www.liaoxuefeng.com" + li.a.get('href')
        urls.append(url)
    return urls


def save_pdf(htmls, file_name):
    """
    Convert HTML to a PDF file.
    :param htmls: an HTML file name, or a list of file names
    :param file_name: output PDF file name
    :return:
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)


def main():
    start = time.time()
    file_name = "liaoxuefeng_Python3_tutorial"
    urls = get_url_list()
    for index, url in enumerate(urls):
        parse_url_to_html(url, str(index) + ".html")
    htmls = []
    pdfs = []
    # Convert each chapter to its own PDF; iterate over the actual number
    # of chapters rather than a hard-coded count
    for i in range(len(urls)):
        htmls.append(str(i) + '.html')
        pdfs.append(file_name + str(i) + '.pdf')

        save_pdf(str(i) + '.html', file_name + str(i) + '.pdf')

        print("Converted HTML file %d" % i)

    merger = PdfFileMerger()
    for pdf in pdfs:
        merger.append(open(pdf, 'rb'))
        print("Merged " + pdf)

    output = open("liaoxuefeng_Python_all.pdf", "wb")
    merger.write(output)
    merger.close()

    print("Merged PDF written successfully!")

    for html in htmls:
        os.remove(html)
        print("Removed temporary file " + html)

    for pdf in pdfs:
        os.remove(pdf)
        print("Removed temporary file " + pdf)

    total_time = time.time() - start
    print("Total time: %f seconds" % total_time)


if __name__ == '__main__':
    main()
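To run the script, install the dependencies (pip install requests beautifulsoup4 pdfkit PyPDF2) and make sure wkhtmltopdf is on PATH, or pass a configuration object as shown in section 1. Note that the target site's markup (the x-wiki-content and uk-nav uk-nav-side classes) may have changed since this was written, in which case the selectors need updating.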

 
