Python处理PDF

reference: How to Work With a PDF in Python
reference: 给PDF添加水印

本文使用的PDF处理库为PyPDF2。注意:示例使用的是PyPDF2 3.0之前的旧版驼峰式接口(PdfFileReader、PdfFileWriter等),这些接口在PyPDF2 3.0中已被移除。

Read info

从一个PDF复制页面到另一个PDF,可以一页一页地读写。

from PyPDF2 import PdfFileReader, PdfFileWriter

def extract_information(pdf_path):
    """Print the document metadata and first-page size of a PDF.

    The author, creator, producer, subject and title fields, the page
    count, and the height/width of the first page's media box (converted
    to millimetres) are formatted into one report and printed.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        None. The report is written to standard output.
    """
    # Media box values are expressed in PDF points (1/72 inch by default),
    # not inches, so the exact conversion factor to millimetres is 25.4 / 72.
    mm_per_point = 25.4 / 72

    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        first_page_box = pdf.getPage(0).mediaBox
        height = float(first_page_box.getHeight()) * mm_per_point
        width = float(first_page_box.getWidth()) * mm_per_point

        # getDocumentInfo() yields None for files without an /Info
        # dictionary; getattr keeps the report printable in that case.
        author = getattr(information, 'author', None)
        creator = getattr(information, 'creator', None)
        producer = getattr(information, 'producer', None)
        subject = getattr(information, 'subject', None)
        title = getattr(information, 'title', None)

    txt = f"""
    Information about {pdf_path}:

    Author: {author}
    Creator: {creator}
    Producer: {producer}
    Subject: {subject}
    Title: {title}
    Number of pages: {number_of_pages}
    Height: {height:.2f}
    Width: {width:.2f}
    """

    print(txt)

Modify info

从一个PDF复制页面到另一个PDF,也可以一次性读写。

def modify_doc_info(pdf_path, author, creator, producer, subject, title):
    """Write a copy of a PDF with new document-information fields.

    All pages are copied unchanged; only the metadata entries are set.
    The copy is written next to the input with ``-modified`` inserted
    before the file extension (``report.pdf`` -> ``report-modified.pdf``).

    Args:
        pdf_path: Path of the source PDF.
        author: Value for the ``/Author`` entry.
        creator: Value for the ``/Creator`` entry.
        producer: Value for the ``/Producer`` entry.
        subject: Value for the ``/Subject`` entry.
        title: Value for the ``/Title`` entry.
    """
    import os  # local import so the snippet works when copied on its own

    pdf = PdfFileReader(pdf_path)
    pdf_writer = PdfFileWriter()
    pdf_writer.addMetadata({
        '/Author': author,
        '/Creator': creator,
        '/Producer': producer,
        '/Subject': subject,
        '/Title': title
    })

    # Split off only the final extension: a plain str.replace would also
    # rewrite ".pdf" inside directory names, and would leave the path
    # unchanged (overwriting the input) for extensions such as ".PDF".
    root, ext = os.path.splitext(pdf_path)
    output = f'{root}-modified{ext}'

    # Add all pages without modification.
    pdf_writer.appendPagesFromReader(pdf)
    # The context manager guarantees the output is flushed and closed.
    with open(output, 'wb') as fh:
        pdf_writer.write(fh)

Rotate Page


def rotate_pages(pdf_path):
    """Demonstrate page rotation on the first three pages of a PDF.

    The first page is turned 90 degrees clockwise, the second 90 degrees
    counter-clockwise, and the third is copied as-is. The three pages are
    written to ``rotate_pages.pdf`` in the current working directory.

    Args:
        pdf_path: Path of the source PDF; its first three pages are read.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(pdf_path)

    # First page: quarter turn to the right.
    turned_right = reader.getPage(0).rotateClockwise(90)
    writer.addPage(turned_right)

    # Second page: quarter turn to the left.
    turned_left = reader.getPage(1).rotateCounterClockwise(90)
    writer.addPage(turned_left)

    # Third page: kept in its normal orientation.
    untouched = reader.getPage(2)
    writer.addPage(untouched)

    with open('rotate_pages.pdf', 'wb') as out_file:
        writer.write(out_file)

Merge PDFs

def merge_pdfs(paths: list, output: str):
    """Concatenate several PDFs into a single file.

    Args:
        paths: Paths of the source PDFs, in the order they should appear.
        output: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()

    for source_path in paths:
        reader = PdfFileReader(source_path)
        page_count = reader.getNumPages()
        for index in range(page_count):
            # Copy the pages one by one, preserving their order.
            current_page = reader.getPage(index)
            merged.addPage(current_page)

    # Everything is collected; emit the combined document.
    with open(output, 'wb') as merged_file:
        merged.write(merged_file)

Split PDFs

def split(path, name_of_split):
    """Write every page of a PDF to its own single-page file.

    Output files are named ``{name_of_split}{index}.pdf`` where ``index``
    is the zero-based page number.

    Args:
        path: Path of the PDF to split.
        name_of_split: Prefix used for the generated file names.
    """
    source = PdfFileReader(path)
    total_pages = source.getNumPages()

    for index in range(total_pages):
        # A fresh writer per page keeps each output to exactly one page.
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(index))

        target = f'{name_of_split}{index}.pdf'
        with open(target, 'wb') as target_file:
            single_page.write(target_file)

Encrypt a PDF

def add_encryption(input_pdf, output_pdf, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_pdf: Path of the PDF to protect.
        output_pdf: Path of the encrypted copy to write.
        password: User password; no separate owner password is supplied.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(input_pdf)

    # Copy the pages one at a time; writer.appendPagesFromReader(reader)
    # is the one-call alternative.
    total_pages = reader.getNumPages()
    for index in range(total_pages):
        writer.addPage(reader.getPage(index))

    writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)

    with open(output_pdf, 'wb') as encrypted_file:
        writer.write(encrypted_file)

Decrypt a PDF

def decrypt_pdf(input_pdf, output_pdf, password):
    """Write an unencrypted copy of a PDF.

    If the input is not encrypted a notice is printed and the pages are
    copied as they are.

    Args:
        input_pdf: Path of the (possibly encrypted) source PDF.
        output_pdf: Path of the decrypted copy to write.
        password: Password used to open ``input_pdf``.

    Returns:
        True when ``output_pdf`` was written; False when the document
        could not be decrypted, in which case nothing is written.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(input_pdf)

    if pdf_reader.isEncrypted:
        try:
            status = pdf_reader.decrypt(password)
        except NotImplementedError as err:
            # Raised by PyPDF2 for encryption schemes it cannot handle.
            print(f"Cannot decrypt {input_pdf}: {err}")
            return False
        # decrypt() reports a rejected password through a zero return
        # value rather than an exception, so the result must be checked.
        if not status:
            print("Wrong password")
            return False
    else:
        print("File is not encrypted")

    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))

    with open(output_pdf, 'wb') as fh:
        pdf_writer.write(fh)
    return True

Add watermark

使用word制作水印页面。
设计->水印->自定义水印,导出PDF作为watermark.pdf

def create_watermark(input_pdf, output, watermark):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    The media-box size of the watermark and of each input page is printed
    in millimetres, which helps to spot size mismatches between the two.

    Args:
        input_pdf: Path of the PDF to watermark.
        output: Path of the watermarked PDF to write.
        watermark: Path of a PDF whose first page is the watermark.
    """
    # Media box values are expressed in PDF points (1/72 inch by default),
    # not inches, so the exact conversion factor to millimetres is 25.4 / 72.
    mm_per_point = 25.4 / 72

    watermark_obj = PdfFileReader(watermark)
    watermark_page = watermark_obj.getPage(0)

    pdf_reader = PdfFileReader(input_pdf)
    pdf_writer = PdfFileWriter()

    mark_height = mm_per_point * float(watermark_page.mediaBox.getHeight())
    mark_width = mm_per_point * float(watermark_page.mediaBox.getWidth())
    # The message text, including its spacing, is kept exactly as before.
    print(f"watermask height: {mark_height:.2f}, "
          f"            watermask width: {mark_width:.2f}")

    # Watermark all the pages.
    for page_number in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(page_number)
        page_height = mm_per_point * float(page.mediaBox.getHeight())
        page_width = mm_per_point * float(page.mediaBox.getWidth())
        print(f"page height: {page_height:.2f}, "
              f"                page width: {page_width:.2f}")
        page.mergePage(watermark_page)
        pdf_writer.addPage(page)

    with open(output, 'wb') as out:
        pdf_writer.write(out)

Images to PDF

本模块需要使用Pillow,对pillow不熟悉的可以看看这一篇

输入两个列表,分别是图片路径和输出路径。其中为了使PDF好看一点,做了一次resize。同时假设了图片尺寸有两种:宽大于高的,宽小于高的,分类做成两个PDF。以后根据需要,可以做更多尺寸的。

此外值得一提的是,为了避免无效的图片路径污染代码,首先会尝试读一遍,把无效路径删去,这里有个技巧是反着读,避免因为删除了某个元素少读了其它元素。

def img2pdf(img_paths: list, outputs: list):
    """Bundle images into two PDFs, one per orientation.

    Images taller than wide go to ``outputs[0]``; all others (wider than
    tall, or square) go to ``outputs[1]``. Within each group every image
    is rescaled, keeping its aspect ratio, so that all pages share the
    group's largest height (tall group) or largest width (wide group).

    Args:
        img_paths: Image file paths. Entries that cannot be opened are
            reported and removed from this list in place.
        outputs: Two output paths: ``[tall_images_pdf, wide_images_pdf]``.
    """
    import io  # local import so the snippet works when copied on its own

    assert isinstance(img_paths, list), "img_paths must be a list"
    assert isinstance(outputs, list), "outputs must be a list"
    pdf_writer1 = PdfFileWriter()  # images whose height is larger than width
    pdf_writer2 = PdfFileWriter()  # images whose width is >= height

    fixed_height = 0
    fixed_width = 0
    # First pass: drop unreadable paths and find the target dimensions.
    # Iterating over a reversed copy makes removing from img_paths safe.
    for img_path in img_paths[::-1]:
        try:
            # The context manager closes the file handle straight away.
            with Image.open(img_path) as img:
                img_width, img_height = img.size
        except Exception:
            # Best effort: anything that cannot be opened is skipped.
            img_paths.remove(img_path)
            print(f"Error: {img_path} is not a valid image path")
        else:
            if img_height > img_width:
                fixed_height = max(fixed_height, img_height)
            else:
                fixed_width = max(fixed_width, img_width)
    print(f"fixed_height: {fixed_height}, fixed_width: {fixed_width}")

    # Second pass: rescale every image and append it as a one-page PDF.
    for img_path in img_paths:
        with Image.open(img_path) as img:
            if img.height > img.width:
                target_size = (int(img.width * fixed_height / img.height),
                               fixed_height)
                target_writer = pdf_writer1
            else:
                target_size = (fixed_width,
                               int(img.height * fixed_width / img.width))
                target_writer = pdf_writer2
            # Image.LANCZOS is the same filter as the ANTIALIAS alias,
            # which newer Pillow releases no longer provide.
            img_resize = img.resize(target_size, Image.LANCZOS)

        # Render the page in memory rather than through fixed-name temp
        # files: nothing is left behind in the working directory and no
        # cleanup can fail when one of the two groups is empty.
        page_buffer = io.BytesIO()
        img_resize.save(page_buffer, format='PDF')
        page_buffer.seek(0)
        temp_reader = PdfFileReader(page_buffer)
        target_writer.addPage(temp_reader.getPage(0))

    print(f"pdf_writer1: {pdf_writer1.getNumPages()}, pdf_writer2: {pdf_writer2.getNumPages()}")
    with open(outputs[0], 'wb') as f:
        pdf_writer1.write(f)
    with open(outputs[1], 'wb') as f:
        pdf_writer2.write(f)

Notebooks to PDF

我们常常需要把jupyter notebook转为PDF,这诚然可以手动用jupyter的功能进行转换,但是对于大量的notebook文件,使用脚本会更加高效。
note: 要进行转换,需要安装nbconvert导出PDF所需的依赖(如pandoc和XeLaTeX)以及jupyter-contrib-nbextensions等相关拓展。

from PyPDF2 import PdfFileMerger
import subprocess
import os

def generate_pdf(nb_paths, pdf_path="merged.pdf"):
    """Convert notebooks to PDF with nbconvert and merge the results.

    Each notebook is converted by running ``jupyter nbconvert --to pdf``.
    The per-notebook PDFs are then merged, in input order, into
    ``pdf_path`` and removed afterwards.

    Args:
        nb_paths: Paths of the ``.ipynb`` files to convert.
        pdf_path: Path of the merged PDF to write.

    Raises:
        subprocess.CalledProcessError: If nbconvert exits with an error
            for one of the notebooks.
    """
    base_args = ['jupyter', 'nbconvert', '--to', 'pdf']

    pdfs = []
    for nb_path in nb_paths:
        # check=True stops at the first failed conversion instead of
        # failing later while merging a PDF that was never produced.
        subprocess.run(base_args + [nb_path], check=True)
        # Swap only the final extension; str.replace would also rewrite
        # ".ipynb" elsewhere in the path (e.g. ".ipynb_checkpoints").
        generated = os.path.splitext(nb_path)[0] + '.pdf'
        pdfs.append(generated)
        print(f'Generated {generated}')

    merger = PdfFileMerger()
    try:
        for pdf in pdfs:
            merger.append(pdf)
        merger.write(pdf_path)
    finally:
        # Release the merger's open inputs even if merging fails.
        merger.close()

    # The intermediate PDFs are only needed for the merge.
    for pdf in pdfs:
        os.remove(pdf)
posted @ 2022-12-03 22:31  Cisco_coco  阅读(163)  评论(0编辑  收藏  举报