Python处理PDF
目录
reference: How to Work With a PDF in Python
reference: 给PDF添加水印
本文使用的PDF处理库为PyPDF2(示例代码使用其旧版接口,如PdfFileReader、PdfFileWriter)。
Read info
从一个PDF复制页面到另一个PDF,可以一页一页地读写。
from PyPDF2 import PdfFileReader, PdfFileWriter
def extract_information(pdf_path):
    """Print the document metadata and the first page's size for a PDF.

    Args:
        pdf_path: Path of the PDF file to inspect.

    The height and width are printed in millimeters. Media-box values are
    in PDF points (1/72 inch), so they are scaled by 25.4 / 72. Metadata
    fields the document does not define are printed as ``None``; for a PDF
    without pages the size is printed as ``n/a``.
    """
    # 72 points per inch and 25.4 mm per inch.
    mm_per_point = 25.4 / 72

    # Everything that touches the reader stays inside the ``with`` block:
    # the reader resolves objects from the open stream on demand.
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

        if number_of_pages > 0:
            media_box = pdf.getPage(0).mediaBox
            height = f"{float(media_box.getHeight()) * mm_per_point:.2f}"
            width = f"{float(media_box.getWidth()) * mm_per_point:.2f}"
        else:
            height = width = "n/a"

        # getDocumentInfo() may return None when the PDF carries no info
        # dictionary; getattr with a default keeps the report printable.
        lines = [
            "",
            f"Information about {pdf_path}:",
            f"Author: {getattr(information, 'author', None)}",
            f"Creator: {getattr(information, 'creator', None)}",
            f"Producer: {getattr(information, 'producer', None)}",
            f"Subject: {getattr(information, 'subject', None)}",
            f"Title: {getattr(information, 'title', None)}",
            f"Number of pages: {number_of_pages}",
            f"Height: {height}",
            f"Width: {width}",
            "",
        ]
        print("\n".join(lines))
Modify info
从一个PDF复制页面到另一个PDF,也可以一次性读写。
def modify_doc_info(pdf_path, author, creator, producer, subject, title):
    """Write a copy of a PDF whose document metadata is replaced.

    The copy is saved next to the source with ``-modified`` inserted before
    the file extension (``report.pdf`` -> ``report-modified.pdf``).

    Args:
        pdf_path: Path of the source PDF.
        author: Value stored under ``/Author``.
        creator: Value stored under ``/Creator``.
        producer: Value stored under ``/Producer``.
        subject: Value stored under ``/Subject``.
        title: Value stored under ``/Title``.
    """
    import os

    pdf = PdfFileReader(pdf_path)
    pdf_writer = PdfFileWriter()

    # Add all pages without modification.
    pdf_writer.appendPagesFromReader(pdf)
    pdf_writer.addMetadata({
        '/Author': author,
        '/Creator': creator,
        '/Producer': producer,
        '/Subject': subject,
        '/Title': title,
    })

    # Split off the real extension instead of str.replace('.pdf', ...):
    # replace() rewrites every '.pdf' in the path and leaves names such as
    # 'x.PDF' unchanged, which would overwrite the source file.
    root, ext = os.path.splitext(pdf_path)
    output = f'{root}-modified{ext}'

    # The context manager guarantees the output handle is flushed and closed.
    with open(output, 'wb') as out:
        pdf_writer.write(out)
Rotate Page
def rotate_pages(pdf_path, output='rotate_pages.pdf'):
    """Write a PDF holding the first three pages of ``pdf_path``, two rotated.

    Page 1 is rotated 90 degrees clockwise, page 2 is rotated 90 degrees
    counter-clockwise and page 3 is copied in its normal orientation. When
    the source has fewer than three pages, only the pages that exist are
    processed instead of failing on a missing page.

    Args:
        pdf_path: Path of the source PDF.
        output: Path of the PDF to write; defaults to ``rotate_pages.pdf``.
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(pdf_path)
    page_count = pdf_reader.getNumPages()

    if page_count > 0:
        # Rotate page 90 degrees to the right.
        pdf_writer.addPage(pdf_reader.getPage(0).rotateClockwise(90))
    if page_count > 1:
        # Rotate page 90 degrees to the left.
        pdf_writer.addPage(pdf_reader.getPage(1).rotateCounterClockwise(90))
    if page_count > 2:
        # Add a page in normal orientation.
        pdf_writer.addPage(pdf_reader.getPage(2))

    with open(output, 'wb') as fh:
        pdf_writer.write(fh)
Merge PDFs
def merge_pdfs(paths : list, output : str):
    """Concatenate the pages of several PDFs into one output file.

    Args:
        paths: Source PDF paths, in the order their pages should appear.
        output: Path of the merged PDF to write.
    """
    merged = PdfFileWriter()

    for source_path in paths:
        reader = PdfFileReader(source_path)
        page_total = reader.getNumPages()
        # Copy this document's pages, in order, onto the end of the result.
        for index in range(page_total):
            source_page = reader.getPage(index)
            merged.addPage(source_page)

    # Everything is collected; write out the merged PDF.
    with open(output, 'wb') as merged_file:
        merged.write(merged_file)
Split PDFs
def split(path, name_of_split):
    """Save every page of a PDF as its own single-page PDF.

    Page ``i`` (counted from 0) is written to ``{name_of_split}{i}.pdf``.

    Args:
        path: Path of the PDF to split.
        name_of_split: Prefix used for the generated file names.
    """
    source = PdfFileReader(path)
    page_total = source.getNumPages()

    for index in range(page_total):
        # A fresh writer per page keeps each output to exactly one page.
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(index))

        target = f'{name_of_split}{index}.pdf'
        with open(target, 'wb') as target_file:
            single_page.write(target_file)
Encrypt a PDF
def add_encryption(input_pdf, output_pdf, password):
    """Copy a PDF and protect the copy with a user password.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the encrypted PDF to write.
        password: User password passed to the writer's ``encrypt`` call.
    """
    writer = PdfFileWriter()
    reader = PdfFileReader(input_pdf)

    # Copy pages one by one (writer.appendPagesFromReader(reader) is the
    # bulk alternative).
    page_total = reader.getNumPages()
    for index in range(page_total):
        current = reader.getPage(index)
        writer.addPage(current)

    encrypt_options = {
        'user_pwd': password,
        'owner_pwd': None,
        'use_128bit': True,
    }
    writer.encrypt(**encrypt_options)

    with open(output_pdf, 'wb') as encrypted_file:
        writer.write(encrypted_file)
Decrypt a PDF
def decrypt_pdf(input_pdf, output_pdf, password):
    """Write an unencrypted copy of a PDF.

    If the source is encrypted it is unlocked with ``password`` first. When
    the password is rejected or the encryption scheme is unsupported, a
    message is printed and no output file is written. An unencrypted source
    is simply copied after printing a notice.

    Args:
        input_pdf: Path of the source PDF.
        output_pdf: Path of the decrypted PDF to write.
        password: Password used to unlock the source.
    """
    pdf_reader = PdfFileReader(input_pdf)

    if pdf_reader.isEncrypted:
        try:
            unlocked = pdf_reader.decrypt(password)
        except NotImplementedError:
            # Raised by PyPDF2 for encryption methods it cannot handle.
            print("Unsupported encryption algorithm")
            return
        # decrypt() reports a wrong password through a falsy return value
        # (0) rather than an exception, so the result must be checked.
        if not unlocked:
            print("Wrong password")
            return
    else:
        print("File is not encrypted")

    pdf_writer = PdfFileWriter()
    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))

    with open(output_pdf, 'wb') as fh:
        pdf_writer.write(fh)
Add watermark
使用word制作水印页面。
设计->水印->自定义水印,导出PDF作为watermark.pdf。
def create_watermark(input_pdf, output, watermark):
    """Stamp the first page of ``watermark`` onto every page of a PDF.

    The watermark size and each page's size are printed in millimeters
    before merging. Media-box values are in PDF points (1/72 inch), so they
    are scaled by 25.4 / 72.

    Args:
        input_pdf: Path of the PDF to watermark.
        output: Path of the watermarked PDF to write.
        watermark: Path of a PDF whose first page is the watermark.
    """
    # 72 points per inch and 25.4 mm per inch.
    mm_per_point = 25.4 / 72

    watermark_obj = PdfFileReader(watermark)
    watermark_page = watermark_obj.getPage(0)
    pdf_reader = PdfFileReader(input_pdf)
    pdf_writer = PdfFileWriter()

    # Adjacent f-strings are used instead of a backslash continuation so the
    # source indentation never leaks into the printed text.
    watermark_box = watermark_page.mediaBox
    print(
        f"watermask height: {mm_per_point * float(watermark_box.getHeight()):.2f}, "
        f"watermask width: {mm_per_point * float(watermark_box.getWidth()):.2f}"
    )

    # Watermark all the pages.
    for page_number in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(page_number)
        page_box = page.mediaBox
        print(
            f"page height: {mm_per_point * float(page_box.getHeight()):.2f}, "
            f"page width: {mm_per_point * float(page_box.getWidth()):.2f}"
        )
        page.mergePage(watermark_page)
        pdf_writer.addPage(page)

    with open(output, 'wb') as out:
        pdf_writer.write(out)
Images to PDF
本模块需要使用Pillow,对pillow不熟悉的可以看看这一篇
输入两个列表,分别是图片路径和输出路径。其中为了使PDF好看一点,做了一次resize。同时假设了图片尺寸有两种:宽大于高的,宽小于高的,分类做成两个PDF。以后根据需要,可以做更多尺寸的。
此外值得一提的是,为了避免无效的图片路径污染代码,首先会尝试读一遍,把无效路径删去,这里有个技巧是反着读,避免因为删除了某个元素少读了其它元素。
def img2pdf(img_paths : list, outputs : list):
    """Bundle images into two PDFs: one for portrait, one for landscape.

    Images taller than wide go to ``outputs[0]``, scaled to the largest
    portrait height found. All other images (including square ones) go to
    ``outputs[1]``, scaled to the largest landscape width found. Aspect
    ratios are preserved.

    Paths that cannot be opened as images are reported and removed from
    ``img_paths`` in place. The name ``Image`` (Pillow's ``PIL.Image``
    module) must be available in the enclosing module.

    Args:
        img_paths: List of image paths; invalid entries are removed from it.
        outputs: List whose first two items are the portrait and landscape
            output PDF paths.
    """
    import io

    assert isinstance(img_paths, list), "img_paths must be a list"
    assert isinstance(outputs, list), "outputs must be a list"

    pdf_writer1 = PdfFileWriter()  # images whose height is larger than width
    pdf_writer2 = PdfFileWriter()  # images whose width is at least the height
    fixed_height = 0
    fixed_width = 0

    # First pass: drop unreadable paths and find the target dimensions.
    # Walking a reversed copy lets entries be removed from img_paths without
    # skipping any element.
    for img_path in img_paths[::-1]:
        try:
            # The context manager releases the file handle right away.
            with Image.open(img_path) as img:
                width, height = img.size
        except (OSError, ValueError, TypeError):
            img_paths.remove(img_path)
            print(f"Error: {img_path} is not a valid image path")
            continue
        if height > width:
            fixed_height = max(fixed_height, height)
        else:
            fixed_width = max(fixed_width, width)
    print(f"fixed_height: {fixed_height}, fixed_width: {fixed_width}")

    # Second pass: resize each image and append it as a one-page PDF.
    for img_path in img_paths:
        with Image.open(img_path) as img:
            if img.height > img.width:
                target_width = int(img.width * fixed_height / img.height)
                # max(1, ...) keeps extreme aspect ratios from yielding 0.
                size = (max(1, target_width), fixed_height)
                writer = pdf_writer1
            else:
                target_height = int(img.height * fixed_width / img.width)
                size = (fixed_width, max(1, target_height))
                writer = pdf_writer2
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            img_resize = img.resize(size, Image.LANCZOS)

        # Render the page in memory instead of fixed temp files in the
        # working directory: nothing to clean up, and no failure when one of
        # the two orientations has no images.
        buffer = io.BytesIO()
        img_resize.save(buffer, format='PDF')
        buffer.seek(0)
        writer.addPage(PdfFileReader(buffer).getPage(0))

    print(f"pdf_writer1: {pdf_writer1.getNumPages()}, pdf_writer2: {pdf_writer2.getNumPages()}")
    with open(outputs[0], 'wb') as f:
        pdf_writer1.write(f)
    with open(outputs[1], 'wb') as f:
        pdf_writer2.write(f)
Notebooks to PDF
我们常常需要把jupyter notebook转为PDF,这诚然可以手动用jupyter的功能进行转换,但是对于大量的notebook文件,使用脚本会更加高效。
note: 要进行转换,需要安装jupyter-contrib-nbextensions等相关拓展。
from PyPDF2 import PdfFileMerger
import subprocess
import os
def generate_pdf(nb_paths, pdf_path="merged.pdf"):
    """Convert notebooks to PDF with ``jupyter nbconvert`` and merge them.

    Each notebook is converted with ``jupyter nbconvert --to pdf``. The
    resulting PDFs are merged in order into ``pdf_path`` and the
    intermediate per-notebook PDFs are then deleted.

    Args:
        nb_paths: Iterable of notebook paths, in merge order.
        pdf_path: Path of the merged PDF to write.

    Raises:
        subprocess.CalledProcessError: If a conversion exits with a
            non-zero status.
    """
    pdfs = []
    for nb_path in nb_paths:
        # check=True surfaces a failed conversion here instead of as a
        # confusing missing-file error during the merge.
        subprocess.run(
            ['jupyter', 'nbconvert', '--to', 'pdf', nb_path],
            check=True,
        )
        # Swap only the real extension. str.replace(".ipynb", ".pdf") could
        # leave the path unchanged, and the cleanup below would then delete
        # the source file itself.
        converted = os.path.splitext(nb_path)[0] + '.pdf'
        print(f'Generated {converted}')
        pdfs.append(converted)

    merger = PdfFileMerger()
    try:
        for pdf in pdfs:
            merger.append(pdf)
        merger.write(pdf_path)
    finally:
        # Close the merger even when appending or writing fails.
        merger.close()

    # Remove the intermediate per-notebook PDFs.
    for pdf in pdfs:
        os.remove(pdf)