Python: convert Word to PDF

 

# encoding: utf-8
# 版权所有 2023 ©涂聚文有限公司
# 许可信息查看: word covert pdf
# 描述:pip install pywin32
#
#      pip install PyPDF2
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/10/26 14:51
# User      : geovindu
# Product   : PyCharm
# Project   : pythonWebScreenShot
# File      : WordCovertPdf.py
# explain   : 学习

import os  # 导入系统功能模块
from win32com.client import Dispatch, DispatchEx  # 导入win32com模块的client包下的函数
from win32com.client import constants  #  导入win32com模块的client包下的保存COM常量的类
from win32com.client import gencache    #  导入win32com模块的client包下的gencache函数
import re  # 导入正则表达式模块
import pythoncom  # 导入封装了OLE自动化API的模块,该模块为win32com的子模块

class WordTPdf(object):
    """Batch-convert Word documents to PDF through Word COM automation.

    The conversion drives a ``Word.Application`` COM object, so it needs the
    ``pywin32`` names imported at the top of this module (``Dispatch``,
    ``constants``, ``gencache``, ``pythoncom``).
    """

    def __init__(self):
        # Neither attribute is read by the methods of this class; they are
        # kept so existing code that touches them keeps working.
        self.dic = ""
        self.filename = ""

    def wordtopdf(self, filelist, targetpath):
        """Convert each document in *filelist* to a PDF stored in *targetpath*.

        Each PDF is named after its source file with the extension replaced
        by ``.pdf``.

        :param filelist: iterable of paths of the documents to convert
        :param targetpath: directory that receives the generated PDF files
        :return: list of absolute paths of the generated PDFs (empty when
                 *filelist* is empty), or ``False`` as soon as one document
                 fails to convert or a ``TypeError`` is raised
        """
        try:
            # COM must be initialised on the calling thread, otherwise Word
            # automation fails with "CoInitialize has not been called".
            pythoncom.CoInitialize()
            try:
                # Load the Word type library so that ``constants`` is populated.
                gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
                return self._convert_all(filelist, targetpath)
            finally:
                # Balance the CoInitialize call above on every exit path.
                pythoncom.CoUninitialize()
        except TypeError as e:
            print('出错了!')
            print(e)
            return False

    def _convert_all(self, filelist, targetpath):
        """Run the conversion loop with one Word instance and always quit it."""
        w = Dispatch("Word.Application")
        try:
            valueList = []
            for fullfilename in filelist:
                # Word's working directory is not the script's, so it must be
                # given absolute paths. They are computed without os.chdir so
                # the process working directory is left untouched and relative
                # inputs keep resolving against the same directory.
                source = os.path.abspath(fullfilename)
                stem = os.path.splitext(os.path.basename(fullfilename))[0]
                output = os.path.abspath(os.path.join(targetpath, stem + ".pdf"))

                exported = False
                try:
                    document = w.Documents.Open(source, ReadOnly=1)
                    try:
                        document.ExportAsFixedFormat(
                            output, constants.wdExportFormatPDF,
                            Item=constants.wdExportDocumentWithMarkup,
                            CreateBookmarks=constants.wdExportCreateHeadingBookmarks)
                        exported = True
                    finally:
                        # Close the document whether or not the export worked.
                        document.Close(constants.wdDoNotSaveChanges)
                        document = None
                except Exception as e:
                    # Best effort: report the problem; the check below turns
                    # it into a failed conversion.
                    print(e)

                # ``exported`` guards against a stale PDF left by an earlier
                # run being mistaken for a successful conversion.
                if exported and os.path.isfile(output):
                    valueList.append(output)
                else:
                    print('转换失败!')
                    return False
            return valueList
        finally:
            # Quit Word on success, failure and exceptions alike, then drop
            # the reference before the caller uninitialises COM.
            w.Quit(constants.wdDoNotSaveChanges)
            w = None

    def getfilenames(self, filepath='', filelist_out=None, file_ext='all'):
        """Collect the files below *filepath*, including sub-directories.

        :param filepath: root directory to walk
        :param filelist_out: optional list to append the results to; a new
                             list is created when omitted
        :param file_ext: ``'all'`` selects every file; ``'.doc'`` selects
                         ``.doc`` and ``.docx`` files and rewrites their
                         backslashes as forward slashes; any other value
                         selects files whose extension equals it exactly
        :return: *filelist_out* (or the new list), sorted, holding the full
                 paths of the selected files
        """
        # A fresh list per call: a mutable default would be shared between
        # calls and keep growing.
        if filelist_out is None:
            filelist_out = []

        word_exts = ('.doc', '.docx')
        for fpath, _dirs, fs in os.walk(filepath):
            for f in fs:
                fi_d = os.path.join(fpath, f)
                ext = os.path.splitext(fi_d)[1]
                if file_ext == '.doc':
                    if ext in word_exts:
                        filelist_out.append(fi_d.replace('\\', '/'))
                elif file_ext == 'all' or ext == file_ext:
                    filelist_out.append(fi_d)

        # Sort once after the walk instead of once per visited directory.
        filelist_out.sort()
        return filelist_out

  

调用:

    # Usage example: convert every Word document below ``sourcepath`` to PDF.
    converter = BLL.WordCovertPdf.WordTPdf()
    sourcepath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/doc/"  # folder holding the Word documents
    targetpath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/pdf/"  # folder receiving the PDFs
    # '.doc' selects both .doc and .docx files; 'all' would also hand every
    # non-Word file in the folder to Word.
    filelist = converter.getfilenames(sourcepath, [], '.doc')
    valueList = converter.wordtopdf(filelist, targetpath)  # batch-convert the documents to PDF
    if valueList:
        print("转换成功!")
    else:
        print("没有要转换的Word文档或者转换失败!")

  

 

    def extract_text_from_pdf(self,file_path):
        """Return the text of every page of a PDF, concatenated in page order.

        Requires PyPDF2 (``pip install PyPDF2``).

        :param file_path: path of the PDF file to read
        :return: the extracted text of all pages joined without a separator
        """
        # Imported locally because the module-level imports visible in this
        # file do not include PyPDF2.
        import PyPDF2

        with open(file_path, 'rb') as f:
            # PdfReader / pages / extract_text replace PdfFileReader /
            # numPages / getPage / extractText, which PyPDF2 3.x removed.
            pdf_reader = PyPDF2.PdfReader(f)
            # Pages are read lazily, so extract while the file is still open.
            return ''.join(page.extract_text() for page in pdf_reader.pages)

    def merge_pdfs(self,input_paths, output_path):
        """Merge several PDF files into one, in the order given.

        Requires PyPDF2 (``pip install PyPDF2``). Nothing is written when
        *input_paths* is empty.

        :param input_paths: iterable of paths of the PDF files to merge
        :param output_path: path of the merged PDF to create
        :return: None
        """
        # Imported locally because the module-level imports visible in this
        # file do not include PyPDF2.
        import PyPDF2

        pdf_merger = PyPDF2.PdfMerger()
        try:
            appended = False
            for path in input_paths:
                with open(path, 'rb') as f:
                    pdf_merger.append(f)
                appended = True
            # Write once, after every input has been appended, rather than
            # rewriting the output after each input; a failure part-way
            # through then leaves no partial output behind.
            if appended:
                with open(output_path, 'wb') as f:
                    pdf_merger.write(f)
        finally:
            # Release the resources held by the merger.
            pdf_merger.close()

    def add_password_protection(self,input_path, output_path, password):
        """Write a password-protected copy of a PDF.

        Requires PyPDF2 (``pip install PyPDF2``).

        :param input_path: path of the PDF file to protect
        :param output_path: path of the encrypted PDF to create
        :param password: password passed to the writer's ``encrypt`` call
        :return: None
        """
        # Imported locally because the module-level imports visible in this
        # file do not include PyPDF2.
        import PyPDF2

        with open(input_path, 'rb') as f:
            # PdfReader / PdfWriter / add_page replace PdfFileReader /
            # PdfFileWriter / addPage, which PyPDF2 3.x removed.
            pdf_reader = PyPDF2.PdfReader(f)
            pdf_writer = PyPDF2.PdfWriter()
            # Copy every page; adding after the loop would keep only the
            # last page and fail on a document with no pages.
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)
            pdf_writer.encrypt(password)
            # Write while the input is still open: page content is read
            # from the source stream on demand.
            with open(output_path, 'wb') as output_file:
                pdf_writer.write(output_file)

  

 

posted @ 2023-10-26 15:39  ®Geovin Du Dream Park™  阅读(25)  评论(0)  编辑  收藏  举报