# Python 3: crawl a web novel and convert it to PDF for a mobile e-book reader
# (python3爬小说然后转为PDF,用于手机小说阅读器观看)

# -*- coding: utf-8 -*-
#1,首先导入库
import requests
from bs4 import BeautifulSoup
import pdfkit
import lxml
import lxml.etree
import os
import os.path
from PyPDF2 import PdfFileReader, PdfFileWriter


#2.获取url列表
def get_url_list(url):
    """
    Fetch the novel's table-of-contents page and collect chapter URLs.

    :param url: URL of the index page (the site serves GBK-encoded HTML)
    :return: list of absolute chapter URLs, in page order
    """
    url_list = []
    body = requests.get(url)
    # The target site is GBK-encoded; decode explicitly before parsing.
    html = body.content.decode("gbk")
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select('a'):
        # Chapter links start with "第" (e.g. "第1章"); other <a> tags are navigation.
        if tag.get_text().startswith("第"):
            # NOTE(review): relies on the module-level `url2` base URL defined
            # in the __main__ block; assumes href is relative -- confirm.
            url_list.append(url2 + tag.get_attribute_list("href")[0])
    return url_list

#3,获取需要的文本和标题
def get_content(url):
    """
    Download one chapter page and wrap its title and body text in the HTML template.

    :param url: chapter URL
    :return: rendered HTML string ready for PDF conversion
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html5lib")
    # NOTE(review): "#h1" matches an element with id="h1", not the <h1> tag --
    # confirm the target site really uses that id.
    title = soup.select("#h1")[0]
    content = soup.select("#content")[0]
    # Strip the site's ad/placeholder marker from the chapter text.
    content = str(content).replace("****", "")
    # NOTE(review): relies on the module-level `html_template` defined in the
    # __main__ block; `title` is a bs4 Tag and is stringified by format().
    html = html_template.format(content=content, title=title)
    return html

#4,保存每一个到一个PDF
def save_pdf(html, filename):
    """
    Render an HTML string to a PDF file via wkhtmltopdf.

    :param html: HTML content to render
    :param filename: output PDF file name
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    # wkhtmltopdf must be installed separately; point pdfkit at the binary
    # explicitly because the install dir may not be on PATH.
    path_wk = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # install location
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    pdfkit.from_string(html, filename, options=options, configuration=config)

#5,获取同一个文件夹下的所有PDF文件名
def _chapter_sort_key(path):
    """Sort numeric file stems numerically (so 2.pdf < 10.pdf), others after, by name."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return (0, int(stem), "") if stem.isdigit() else (1, 0, stem)


def getFileName(filepath):
    """
    Collect the full paths of all files under a directory (recursively).

    :param filepath: root directory to scan
    :return: list of full file paths, sorted so numerically-named chapter
             files merge in chapter order
    """
    file_list = []
    for root, dirs, files in os.walk(filepath):
        for name in files:
            file_list.append(os.path.join(root, name))
    # os.walk yields files in arbitrary filesystem order; sort so the later
    # PDF merge is deterministic and chapters stay in numeric order.
    file_list.sort(key=_chapter_sort_key)
    return file_list


#6,合并同一个文件夹下所有PDF文件
def MergePDF(filepath, outfile):
    """
    Merge every PDF file under `filepath` into a single output PDF.

    :param filepath: directory containing the chapter PDF files
    :param outfile: name of the merged PDF (appended to `filepath` verbatim)
    :return: None
    """
    output = PdfFileWriter()
    total_pages = 0
    # PyPDF2's legacy reader loads page streams lazily, so every source file
    # must stay open until output.write() finishes; track handles and close
    # them in `finally` (the original leaked them all).
    open_handles = []
    try:
        for pdf_path in getFileName(filepath):
            print(pdf_path)
            handle = open(pdf_path, "rb")
            open_handles.append(handle)
            reader = PdfFileReader(handle)

            # Encrypted PDFs must be decrypted before PyPDF2 can read them.
            if reader.isEncrypted:
                reader.decrypt("map")

            page_count = reader.getNumPages()
            total_pages += page_count
            print(page_count)

            # Copy every page of this source into the merged document.
            for page_index in range(page_count):
                output.addPage(reader.getPage(page_index))

        print("All Pages Number:" + str(total_pages))
        # NOTE(review): string concatenation, not os.path.join -- with a
        # directory path lacking a trailing separator the merged file lands
        # NEXT TO the directory (e.g. "dailyScript总裁.pdf"); confirm intent.
        with open(filepath + outfile, "wb") as out_stream:
            output.write(out_stream)
    finally:
        for handle in open_handles:
            handle.close()
    print("finished")



if __name__ == '__main__':
    # Index page of the novel and the base URL for relative chapter links.
    # NOTE(review): placeholder hosts -- fill in the real site before running.
    # These are read as module-level globals by get_url_list/get_content.
    url = 'http://www.'
    url2 = 'http://www/'
    html_template = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    </head>
    <body>
    {title}
    {content}
    </body>
    </html>
    """
    # Crawl every chapter and render each one to its own numbered PDF.
    for k, url in enumerate(get_url_list(url)):
        print(url)
        html = get_content(url)
        print(html)
        # NOTE(review): chapter PDFs are written to the current working
        # directory, but MergePDF reads from file_dir below -- run the script
        # from that directory (or the merge will find nothing).
        save_pdf(html, str(k) + r".pdf")
    file_dir = r'C:\Users\Administrator\Desktop\dailyScript'
    out = "总裁.pdf"
    MergePDF(file_dir, out)
# posted @ 2018-12-19 13:29  hyolyn  (blog footer; kept as a comment so the file stays valid Python)