# Python 3: crawl a web novel and convert it to PDF for a mobile e-book reader
# (python3爬小说然后转为PDF,用于手机小说阅读器观看)

# -*- coding: utf-8 -*-
#1,首先导入库
import requests
from bs4 import BeautifulSoup
import pdfkit
import lxml
import lxml.etree
import os
import os.path
from PyPDF2 import PdfFileReader, PdfFileWriter


#2.获取url列表
def get_url_list(url):
    """
    Fetch the novel's table-of-contents page and collect chapter URLs.

    :param url: URL of the index page (the site serves GBK-encoded HTML)
    :return: list of absolute chapter URLs, in page order
    """
    url_list = []
    body = requests.get(url)
    # The target site is GBK-encoded; decode explicitly before parsing.
    html = body.content.decode("gbk")
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select('a'):
        # Chapter links start with "第" (e.g. "第1章"); other <a> tags are navigation.
        if tag.get_text().startswith("第"):
            # NOTE(review): relies on the module-level `url2` base URL defined
            # in the __main__ block; assumes href is relative -- confirm.
            url_list.append(url2 + tag.get_attribute_list("href")[0])
    return url_list

#3,获取需要的文本和标题
def get_content(url):
    """
    Download one chapter page and wrap its title and body text in the HTML template.

    :param url: chapter URL
    :return: rendered HTML string ready for PDF conversion
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html5lib")
    # NOTE(review): "#h1" matches an element with id="h1", not the <h1> tag --
    # confirm the target site really uses that id.
    title = soup.select("#h1")[0]
    content = soup.select("#content")[0]
    # Strip the site's ad/placeholder marker from the chapter text.
    content = str(content).replace("****", "")
    # NOTE(review): relies on the module-level `html_template` defined in the
    # __main__ block; `title` is a bs4 Tag and is stringified by format().
    html = html_template.format(content=content, title=title)
    return html

#4,保存每一个到一个PDF
def save_pdf(html, filename):
    """
    Render an HTML string to a PDF file via wkhtmltopdf.

    :param html: HTML content to render
    :param filename: output PDF file name
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    # wkhtmltopdf must be installed separately; point pdfkit at the binary
    # explicitly because the install dir may not be on PATH.
    path_wk = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # install location
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    pdfkit.from_string(html, filename, options=options, configuration=config)

#5,获取同一个文件夹下的所有PDF文件名
def _chapter_sort_key(path):
    """Sort numeric file stems numerically (so 2.pdf < 10.pdf), others after, by name."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return (0, int(stem), "") if stem.isdigit() else (1, 0, stem)


def getFileName(filepath):
    """
    Collect the full paths of all files under a directory (recursively).

    :param filepath: root directory to scan
    :return: list of full file paths, sorted so numerically-named chapter
             files merge in chapter order
    """
    file_list = []
    for root, dirs, files in os.walk(filepath):
        for name in files:
            file_list.append(os.path.join(root, name))
    # os.walk yields files in arbitrary filesystem order; sort so the later
    # PDF merge is deterministic and chapters stay in numeric order.
    file_list.sort(key=_chapter_sort_key)
    return file_list


#6,合并同一个文件夹下所有PDF文件
def MergePDF(filepath, outfile):
    """
    Merge every PDF file under `filepath` into a single output PDF.

    :param filepath: directory containing the chapter PDF files
    :param outfile: name of the merged PDF (appended to `filepath` verbatim)
    :return: None
    """
    output = PdfFileWriter()
    total_pages = 0
    # PyPDF2's legacy reader loads page streams lazily, so every source file
    # must stay open until output.write() finishes; track handles and close
    # them in `finally` (the original leaked them all).
    open_handles = []
    try:
        for pdf_path in getFileName(filepath):
            print(pdf_path)
            handle = open(pdf_path, "rb")
            open_handles.append(handle)
            reader = PdfFileReader(handle)

            # Encrypted PDFs must be decrypted before PyPDF2 can read them.
            if reader.isEncrypted:
                reader.decrypt("map")

            page_count = reader.getNumPages()
            total_pages += page_count
            print(page_count)

            # Copy every page of this source into the merged document.
            for page_index in range(page_count):
                output.addPage(reader.getPage(page_index))

        print("All Pages Number:" + str(total_pages))
        # NOTE(review): string concatenation, not os.path.join -- with a
        # directory path lacking a trailing separator the merged file lands
        # NEXT TO the directory (e.g. "dailyScript总裁.pdf"); confirm intent.
        with open(filepath + outfile, "wb") as out_stream:
            output.write(out_stream)
    finally:
        for handle in open_handles:
            handle.close()
    print("finished")



if __name__ == '__main__':
    # Index page of the novel and the base URL for relative chapter links.
    # NOTE(review): placeholder hosts -- fill in the real site before running.
    # These are read as module-level globals by get_url_list/get_content.
    url = 'http://www.'
    url2 = 'http://www/'
    html_template = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    </head>
    <body>
    {title}
    {content}
    </body>
    </html>
    """
    # Crawl every chapter and render each one to its own numbered PDF.
    for k, url in enumerate(get_url_list(url)):
        print(url)
        html = get_content(url)
        print(html)
        # NOTE(review): chapter PDFs are written to the current working
        # directory, but MergePDF reads from file_dir below -- run the script
        # from that directory (or the merge will find nothing).
        save_pdf(html, str(k) + r".pdf")
    file_dir = r'C:\Users\Administrator\Desktop\dailyScript'
    out = "总裁.pdf"
    MergePDF(file_dir, out)
# posted @ 2018-12-19 13:29  hyolyn  (blog footer; kept as a comment so the file stays valid Python)