python-pdf2image

关于使用 Python 将 PDF 转换为图片,网上大部分教程介绍的都是 pdf2image 这个包:
https://pypi.org/project/pdf2image/

image

pdf2image 依赖一个名为 poppler 的外部程序,但教程里给出的下载链接大多已经失效;去 GitHub 下载最新版本后,会发现解压目录下只有 Library 和 share 两个文件夹(可执行文件位于 Library\bin 中,需要把该目录加入系统 PATH)。可以直接使用下面这个版本:

https://gh.jiasu.in/https://github.com/oschwartz10612/poppler-windows/releases/download/v0.90.1/release.zip
安装完成后记得重启电脑,使新的环境变量生效。

from pdf2image import convert_from_path
from PIL import Image
import shutil
import os

# NOTE: a `global` statement at module scope has no effect. The shared
# `index` counter is actually created by the `index = 1` assignment at the
# bottom of this script, before the directory scan starts.
global index

def convert_pdf_to_jpg(pdf_path, output_folder):
    """
    Render every page of a PDF file as a JPEG image.

    Each page is written to ``output_folder`` as ``<index>.jpg``, where
    ``index`` is the module-level counter shared with the rest of the
    script; the counter is advanced by one for every page saved.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Directory that receives the JPG files. It is created
            when it does not exist yet.
    """
    global index
    # Create the destination directory up front so saving cannot fail on it.
    os.makedirs(output_folder, exist_ok=True)
    for page in convert_from_path(pdf_path):
        target = os.path.join(output_folder, f"{index}.jpg")
        page.save(target, 'JPEG')
        index += 1

def getAllSub(path, dirlist=None, filelist=None):
    """
    Recursively walk ``path`` and export every supported file as a numbered JPG.

    PDF files are rendered page by page through ``convert_pdf_to_jpg``,
    JPG/JPEG files are copied, and PNG files are re-encoded as JPEG. Every
    output file is written to the module-level ``output_dir`` and named
    ``<index>.jpg`` using the module-level ``index`` counter, which is
    incremented after each file written here. Extensions are matched
    case-insensitively. For any other file the last three characters of its
    name are printed.

    Args:
        path: Directory to scan.
        dirlist: Optional list returned as the first element of the result.
            Nothing is appended to it by this function.
        filelist: Optional list returned as the second element of the
            result. Nothing is appended to it by this function.

    Returns:
        The tuple ``(dirlist, filelist)``; fresh empty lists are used when
        the arguments are omitted.

    Raises:
        OSError: If ``path`` cannot be listed or an output file cannot be
            written.
    """
    global index
    # Use fresh lists per call: mutable default arguments would be shared
    # between unrelated calls.
    dirlist = [] if dirlist is None else dirlist
    filelist = [] if filelist is None else filelist

    # Sort the listing so the output numbering is reproducible; os.listdir
    # returns entries in arbitrary order.
    entries = sorted(os.listdir(path))
    print(f"正在查询路径: {path}")

    # The JPG/PNG branches write straight into output_dir, so make sure it
    # exists. An empty string means the current directory, which
    # os.makedirs would reject.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_abs = os.path.abspath(output_dir)

    # Modes Pillow can save as JPEG without conversion.
    jpeg_ready_modes = ("1", "L", "RGB")

    for filename in entries:
        subpath = os.path.join(path, filename)
        if os.path.isdir(subpath):
            # Never descend into the output folder, otherwise files that
            # were just generated would be exported a second time.
            if os.path.abspath(subpath) == output_abs:
                continue
            print("给下个目录的参数", index)
            getAllSub(subpath)  # recurse into the sub-directory
        elif os.path.isfile(subpath):
            # Compare the real extension, ignoring case, instead of the last
            # three characters of the name.
            extension = os.path.splitext(filename)[1].lower()
            if extension == '.pdf':
                convert_pdf_to_jpg(subpath, output_dir)
            elif extension in ('.jpg', '.jpeg'):
                jpg_filename = os.path.join(output_dir, f"{index}.jpg")
                shutil.copy(subpath, jpg_filename)
                index += 1
            elif extension == '.png':
                jpg_filename = os.path.join(output_dir, f"{index}.jpg")
                with Image.open(subpath) as im:
                    # JPEG has no alpha channel or palette; saving such a
                    # PNG directly raises OSError, so convert it first.
                    if im.mode in jpeg_ready_modes:
                        im.save(jpg_filename, "JPEG")
                    else:
                        im.convert("RGB").save(jpg_filename, "JPEG")
                index += 1
            else:
                print(filename[-3:])

    return dirlist, filelist

# Counter used to name the output files (1.jpg, 2.jpg, ...)
index = 1
# Folder that receives the generated JPG files
output_dir = ""
# Root directory to scan
directory_path = ""
dirlist, filelist = getAllSub(directory_path)
# print(filelist)

posted @ 2024-05-18 09:45  anyiya  阅读(50)  评论(0编辑  收藏  举报