python - pdf去除水印
可以使用微信截图等工具查看待删除水印的 RGB 值,然后根据该 RGB 值编写判断函数,用来判断某个像素点是否需要置为白色。
pip3 install fitz  (注意:代码中的 `import fitz` 由 pymupdf 提供;PyPI 上名为 fitz 的包与其无关,通常不需要安装,且可能与 pymupdf 冲突)
pip3 install pillow
pip3 install tqdm
pip3 install pymupdf
1. pdf转为图片并去除水印
import fitz
from itertools import product
import os
from functools import cmp_to_key
from tqdm import trange
# Open the source PDF file
pdf = fitz.open('./test.pdf')
def check_rgb(rgb, blue_min=230, sum_min=560):
    """Return True if a pixel looks like watermark and should be whitened.

    Args:
        rgb: Pixel components as a sequence ``(r, g, b[, a])``; only the
            first three components are inspected.
        blue_min: Minimum blue value treated as the blue part of the
            watermark.
        sum_min: Minimum ``r + g + b`` treated as the light grey part of
            the watermark.

    Returns:
        True when either threshold is reached, otherwise False.
    """
    # Slice to three channels so an alpha component, if present, cannot
    # inflate the brightness sum.
    return rgb[2] >= blue_min or sum(rgb[:3]) >= sum_min
# Render at 2x in both directions so the saved images are sharper.
zoom_x = 2
zoom_y = 2
mat = fitz.Matrix(zoom_x, zoom_y)
# The images are written into ./pic; create it up front so saving does not
# fail when the directory is missing.
os.makedirs('./pic', exist_ok=True)
white = (255, 255, 255)
for page_num in trange(pdf.page_count):
    # Rasterize the page to a pixmap without an alpha channel.
    pixmap = pdf[page_num].get_pixmap(matrix=mat, alpha=False)
    for x, y in product(range(pixmap.width), range(pixmap.height)):
        # Whiten a pixel when it lies in the top band (y <= 75), in the
        # bottom-left corner (x <= 600 and y >= 1615), or when its colour
        # matches the watermark. The positional tests come first so the
        # pixel value is only read when it is actually needed.
        if (
            y <= 75
            or (x <= 600 and y >= 1615)
            or check_rgb(pixmap.pixel(x, y))
        ):
            pixmap.set_pixel(x, y, white)
    pixmap.pil_save(f'./pic/{page_num}.png')
# The source document is not needed once every page has been rendered.
pdf.close()
2. 图片合并并生成pdf
def file_filter(tmp):
    """Return True if the file name *tmp* ends with ".png".

    Used to drop every directory entry that is not a PNG page image.
    The comparison is case-sensitive.
    """
    return tmp.endswith(".png")
# Directory the watermark-removal step saves its page images to; the images
# must be listed and opened from the same place they were written.
image_dir = "./pic"
# Keep only the PNG page images found in that directory.
jpg_names = list(filter(file_filter, os.listdir(image_dir)))


def cmp(x, y):
    """Compare two "<number>.png" file names by their numeric stem.

    Returns a negative, zero or positive integer, as required by
    functools.cmp_to_key. Raises ValueError if a stem is not an integer.
    """
    return int(x[0:-4]) - int(y[0:-4])


# Sort numerically so that, for example, "10.png" comes after "2.png".
jpg_names = sorted(jpg_names, key=cmp_to_key(cmp))
# Convert each image to a PDF and append it to the combined document.
pdf = fitz.open()
for name in jpg_names:
    # Close every intermediate document as soon as it has been consumed.
    with fitz.open(os.path.join(image_dir, name)) as img:
        pdfbytes = img.convert_to_pdf()
    with fitz.open("pdf", pdfbytes) as imgpdf:
        pdf.insert_pdf(imgpdf)
pdf.save("combined.pdf")
pdf.close()