利用Python的PyPDF2和PyMuPDF库玩转PDF的提取、合并、旋转、缩放、加密
一、安装PyPDF2和PyMuPDF库
pip install PyPDF2
pip install pymupdf # fitz是pymupdf的子模块
二、工具类代码
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfFileMerger
import fitz
import re
import os
def pdf_separate_by_page_no_list(pdf_input_path, pdf_output_path, page_no_list, rotate_angle=0):
    """Extract the listed pages from a PDF into a new PDF.

    Args:
        pdf_input_path: Path of the source PDF.
        pdf_output_path: Path of the PDF to create. An existing file is
            overwritten. Should not be the same file as ``pdf_input_path``,
            because the input is still being read while the output is written.
        page_no_list: 1-based page numbers to extract, in output order.
        rotate_angle: Clockwise rotation in degrees applied to every extracted
            page (PyPDF2 expects a multiple of 90).
    """
    output = PdfFileWriter()
    with open(pdf_input_path, 'rb') as in_pdf:
        pdf_file = PdfFileReader(in_pdf)
        for page_no in page_no_list:
            # Page numbers are 1-based for callers, 0-based for PyPDF2.
            output.addPage(pdf_file.getPage(page_no - 1).rotateClockwise(rotate_angle))
        # Write while the input stream is still open: the writer pulls page
        # data from the reader's stream at write time. Use 'wb' (not 'ab') so
        # an existing output file is replaced rather than having a second PDF
        # appended to it.
        with open(pdf_output_path, 'wb') as out_pdf:
            output.write(out_pdf)
def pdf_separate_from_start_to_end(pdf_input_path, pdf_output_path, start_page_no, end_page_no, rotate_angle=0):
    """Extract an inclusive page range from a PDF into a new PDF.

    Args:
        pdf_input_path: Path of the source PDF.
        pdf_output_path: Path of the PDF to create. An existing file is
            overwritten. Should not be the same file as ``pdf_input_path``,
            because the input is still being read while the output is written.
        start_page_no: 1-based number of the first page to extract.
        end_page_no: 1-based number of the last page to extract (inclusive).
        rotate_angle: Clockwise rotation in degrees applied to every extracted
            page (PyPDF2 expects a multiple of 90).
    """
    output = PdfFileWriter()
    with open(pdf_input_path, 'rb') as in_pdf:
        pdf_file = PdfFileReader(in_pdf)
        # range() end is exclusive, so 0-based [start - 1, end) covers the
        # 1-based inclusive range [start, end].
        for i in range(start_page_no - 1, end_page_no):
            output.addPage(pdf_file.getPage(i).rotateClockwise(rotate_angle))
        # Write while the input stream is still open, and with 'wb' so an
        # existing output file is replaced instead of appended to.
        with open(pdf_output_path, 'wb') as out_pdf:
            output.write(out_pdf)
def pdf_merge_from_paths(pdf_path_list, pdf_output_path):
    """Merge several PDFs, given by path, into one PDF.

    Bookmarks of the input files are not imported into the result.

    Args:
        pdf_path_list: Paths of the PDFs to merge, in output order.
        pdf_output_path: Path of the merged PDF to write.
    """
    file_merger = PdfFileMerger()
    try:
        for pdf in pdf_path_list:
            file_merger.append(pdf, import_bookmarks=False)
        file_merger.write(pdf_output_path)
    finally:
        # The merger opens every appended path itself; close() releases
        # those file handles even if appending or writing fails.
        file_merger.close()
def pdf_merge_from_dir(pdf_dir, pdf_output_path):
    """Merge every ``.pdf`` file found directly inside a directory.

    Files are merged in sorted file-name order so the result is
    deterministic (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        pdf_dir: Directory containing the PDFs to merge (not searched
            recursively).
        pdf_output_path: Path of the merged PDF to write. If this path lies
            inside ``pdf_dir`` it is skipped as an input, so re-running the
            merge does not fold an old result into the new one.
    """
    output_abs = os.path.abspath(pdf_output_path)
    pdf_lst = [
        os.path.join(pdf_dir, filename)
        for filename in sorted(os.listdir(pdf_dir))
        if filename.endswith('.pdf')
    ]
    pdf_lst = [path for path in pdf_lst if os.path.abspath(path) != output_abs]

    file_merger = PdfFileMerger()
    try:
        for pdf in pdf_lst:
            file_merger.append(pdf)
        file_merger.write(pdf_output_path)
    finally:
        # Release the file handles the merger opened for each input.
        file_merger.close()
def pdf_move_page_no_list(pdf_path, aim_page_no, move_page_no_list, is_behind=True):
    """Move pages in front of or behind a target page, rewriting the PDF in place.

    Args:
        pdf_path: Path of the PDF to modify; the file is replaced with the
            reordered document.
        aim_page_no: 1-based number of the target page.
        move_page_no_list: 1-based numbers of the pages to move; they are
            inserted in the order given.
        is_behind: If True the moved pages are placed right after the target
            page, otherwise right before it.
    """
    import io  # local import: the file's top-level imports do not include io

    moved = set(move_page_no_list)
    # Number of leading pages (0-based indices [0, split)) that stay in front
    # of the moved pages: includes the target page only when moving behind it.
    split = aim_page_no if is_behind else aim_page_no - 1

    output = PdfFileWriter()
    buffer = io.BytesIO()
    with open(pdf_path, 'rb') as in_pdf:
        pdf_file = PdfFileReader(in_pdf)
        total = len(pdf_file.pages)

        # Part 1: pages before the insertion point. Pages that are being
        # moved must be skipped here too, otherwise they appear twice.
        order = [i for i in range(split) if i + 1 not in moved]
        # Part 2: the moved pages, in the caller's order.
        order += [page_no - 1 for page_no in move_page_no_list]
        # Part 3: everything after the insertion point that was not moved.
        order += [i for i in range(split, total) if i + 1 not in moved]

        for i in order:
            output.addPage(pdf_file.getPage(i))
        # Serialize while the input stream is still open; the writer reads
        # page data from it at write time.
        output.write(buffer)

    # The reader is closed now, so the original file can be safely replaced.
    # (Appending with 'ab' would leave the old PDF in front of the new one.)
    with open(pdf_path, 'wb') as out_pdf:
        out_pdf.write(buffer.getvalue())
def pdf_rotate_page_no_list(pdf_path, rotate_page_no_list, rotate_angle=0):
    """Rotate selected pages of a PDF, rewriting the PDF in place.

    Args:
        pdf_path: Path of the PDF to modify; the file is replaced with the
            rotated document.
        rotate_page_no_list: 1-based numbers of the pages to rotate.
        rotate_angle: Clockwise rotation in degrees for the selected pages
            (PyPDF2 expects a multiple of 90). Other pages are rotated by 0.
    """
    import io  # local import: the file's top-level imports do not include io

    to_rotate = set(rotate_page_no_list)
    output = PdfFileWriter()
    buffer = io.BytesIO()
    with open(pdf_path, 'rb') as in_pdf:
        pdf_file = PdfFileReader(in_pdf)
        for i in range(len(pdf_file.pages)):
            # Pages not selected still go through rotateClockwise, with 0.
            angle = rotate_angle if i + 1 in to_rotate else 0
            output.addPage(pdf_file.getPage(i).rotateClockwise(angle))
        # Serialize while the input stream is still open; the writer reads
        # page data from it at write time.
        output.write(buffer)

    # The reader is closed now, so the original file can be safely replaced.
    # (Appending with 'ab' would leave the old PDF in front of the new one.)
    with open(pdf_path, 'wb') as out_pdf:
        out_pdf.write(buffer.getvalue())
def covert2pic(doc, page_count, zoom):
    """Render the pages of an open fitz document to PNG files and close it.

    Page ``n`` (1-based) is written to ``.pdf/n.png`` relative to the current
    working directory.

    Args:
        doc: An open fitz document. It is closed before this function
            returns, including when rendering fails.
        page_count: Number of pages to render, starting from the first page.
        zoom: Scale factor in percent (e.g. 80, 100, 120); converted with
            ``int()`` before use.
    """
    os.makedirs('.pdf', exist_ok=True)
    # Remove numbered PNGs left behind by an earlier, interrupted run.
    # os.removedirs() cannot be used for this: it only removes empty
    # directories and raises when leftovers are present.
    for name in os.listdir('.pdf'):
        stem, ext = os.path.splitext(name)
        if ext == '.png' and stem.isdigit():
            os.remove(os.path.join('.pdf', name))

    # The scale matrix is the same for every page, so build it once.
    scale = int(zoom) / 100.0
    trans = fitz.Matrix(scale, scale)
    try:
        for pg in range(page_count):
            page = doc.load_page(pg)
            img_path = '.pdf/%s.png' % str(pg + 1)
            pm = page.get_pixmap(matrix=trans, alpha=False)
            pm.save(img_path)
            print(page)
    finally:
        doc.close()
def pic2pdf(pdf_output_path, page_count):
    """Assemble the PNGs ``.pdf/1.png`` .. ``.pdf/<page_count>.png`` into one PDF.

    Each image becomes one page, in numeric order, and each image file is
    deleted once it has been converted.

    Args:
        pdf_output_path: Path of the PDF to write; an existing file is
            removed first.
        page_count: Number of numbered PNG files to consume.
    """
    doc = fitz.open()
    try:
        for pg in range(page_count):
            img_path = '.pdf/%s.png' % str(pg + 1)
            img_doc = fitz.open(img_path)
            try:
                pdf_bytes = img_doc.convert_to_pdf()
            finally:
                # Close before deleting the file so no open document still
                # refers to it.
                img_doc.close()
            os.remove(img_path)

            img_pdf = fitz.open("pdf", pdf_bytes)
            try:
                doc.insert_pdf(img_pdf)
            finally:
                img_pdf.close()

        if os.path.exists(pdf_output_path):
            os.remove(pdf_output_path)
        doc.save(pdf_output_path)
    finally:
        doc.close()
def pdf_zoom(pdf_input_path, pdf_output_path, zoom):
    """Scale a PDF by rasterizing every page and rebuilding it from the images.

    The resulting PDF consists of page images rather than the original
    page content.

    Args:
        pdf_input_path: Path of the source PDF.
        pdf_output_path: Path of the scaled PDF to write.
        zoom: Scale factor in percent, e.g. 80, 100, 120.
    """
    source = fitz.open(pdf_input_path)
    # Read the page count up front: covert2pic closes the document it is given.
    total_pages = source.page_count
    covert2pic(source, total_pages, zoom)
    pic2pdf(pdf_output_path, total_pages)
    # Drop the temporary image directory.
    os.removedirs('.pdf')
def pdf_encrypt(pdf_input_path, pdf_output_path, password):
    """Write a password-protected copy of a PDF.

    Args:
        pdf_input_path: Path of the source PDF.
        pdf_output_path: Path of the encrypted PDF to create. An existing
            file is overwritten. Should not be the same file as
            ``pdf_input_path``, because the input is still being read while
            the output is written.
        password: Password passed to the writer's ``encrypt`` call.
    """
    output = PdfFileWriter()
    output.encrypt(password)
    with open(pdf_input_path, 'rb') as in_pdf:
        pdf_file = PdfFileReader(in_pdf)
        for page in pdf_file.pages:
            output.addPage(page)
        # Write while the input stream is still open, and with 'wb' so an
        # existing output file is replaced instead of appended to.
        with open(pdf_output_path, 'wb') as out_pdf:
            output.write(out_pdf)
God will send the rain when you are ready. You need to prepare your field to receive it.