python: convert Word to PDF

100

101

102

103

104

105

106

# encoding: utf-8
# 版权所有 2023 ©涂聚文有限公司
# 许可信息查看： word covert pdf
# 描述：pip install pywin32
#
#      pip install PyPDF2
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/10/26 14:51
# User      : geovindu
# Product   : PyCharm
# Project   : pythonWebScreenShot
# File      : WordCovertPdf.py
# explain   : 学习
 
import os  # 导入系统功能模块
from win32com.client import Dispatch, DispatchEx  # 导入win32com模块的client包下的函数
from win32com.client import constants  #  导入win32com模块的client包下的保存COM常量的类
from win32com.client import gencache    #  导入win32com模块的client包下的gencache函数
import re  # 导入正则表达式模块
import pythoncom  # 导入封装了OLE自动化API的模块，该模块为win32com的子模块
 
class WordTPdf(object):
    """
    Batch-convert Word documents to PDF by automating Microsoft Word over COM.

    ``getfilenames`` collects candidate files from a directory tree and
    ``wordtopdf`` exports each of them to PDF through ``Word.Application``.
    """

    def __init__(self):
        self.dic = ""
        self.filename = ""

    def wordtopdf(self, filelist, targetpath):
        """
        Convert each Word document in ``filelist`` to a PDF in ``targetpath``.

        :param filelist: iterable of Word document paths
        :param targetpath: directory that receives the generated PDF files
        :return: list of absolute paths of the generated PDFs (empty when
                 ``filelist`` is empty); ``False`` as soon as one document
                 fails to convert or a ``TypeError`` is raised
        """
        valueList = []
        word = None
        com_initialized = False
        try:
            # Initialise COM for the calling thread; without it Word automation
            # fails with "CoInitialize has not been called".
            pythoncom.CoInitialize()
            com_initialized = True
            gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
            word = Dispatch("Word.Application")
            for fullfilename in filelist:
                pdf_name = self._convert_one(word, fullfilename, targetpath)
                if pdf_name is None:
                    print('转换失败！')
                    return False
                valueList.append(pdf_name)
            return valueList
        except TypeError as e:
            print('出错了！')
            print(e)
            return False
        finally:
            # Always shut Word down, including on the early ``return False`` and
            # on unexpected exceptions, so no orphaned Word process is left.
            if word is not None:
                try:
                    word.Quit(constants.wdDoNotSaveChanges)
                except Exception as e:
                    print(e)
                # Drop the COM reference before COM is torn down for this thread.
                word = None
            if com_initialized:
                # Balance the successful CoInitialize() above.
                pythoncom.CoUninitialize()

    def _convert_one(self, word, fullfilename, targetpath):
        """
        Export a single document to PDF with an already running Word instance.

        :param word: the ``Word.Application`` COM object
        :param fullfilename: path of the Word document to convert
        :param targetpath: directory that receives the PDF
        :return: absolute path of the PDF if that file exists afterwards,
                 otherwise ``None``
        """
        # Word's working directory is not the script's, so it needs absolute
        # paths. They are built directly instead of via os.chdir(), which would
        # change the working directory of the whole process.
        source = os.path.abspath(fullfilename)
        stem = os.path.splitext(os.path.basename(fullfilename))[0]
        output = os.path.join(os.path.abspath(targetpath), stem + ".pdf")

        document = None
        try:
            document = word.Documents.Open(source, ReadOnly=1)
            document.ExportAsFixedFormat(
                output,
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks)
        except Exception as e:
            # Best effort: report the problem; the existence check below
            # decides whether the conversion counts as successful.
            print(e)
        finally:
            if document is not None:
                try:
                    document.Close(constants.wdDoNotSaveChanges)
                except Exception as e:
                    print(e)

        # NOTE: a PDF left over from an earlier run also passes this check.
        return output if os.path.isfile(output) else None

    def getfilenames(self, filepath='', filelist_out=None, file_ext='all'):
        """
        Recursively collect file paths below ``filepath``.

        :param filepath: root directory to walk (sub-directories included)
        :param filelist_out: optional list the matches are appended to; a new
                             list is created when omitted
        :param file_ext: ``'all'`` keeps every file; ``'.doc'`` keeps ``.doc``
                         and ``.docx`` files (with backslashes replaced by
                         ``/``); any other value keeps files whose extension
                         equals it exactly
        :return: the sorted list of collected paths (``filelist_out`` itself
                 when one was passed in)
        """
        # A fresh list per call: a mutable default would be shared between
        # calls and accumulate paths from earlier invocations.
        if filelist_out is None:
            filelist_out = []

        walked = False
        for fpath, _dirs, fs in os.walk(filepath):
            walked = True
            for f in fs:
                fi_d = os.path.join(fpath, f)
                extension = os.path.splitext(fi_d)[1]
                if file_ext == '.doc':
                    if extension in ('.doc', '.docx'):
                        filelist_out.append(fi_d.replace('\\', '/'))
                elif file_ext == 'all' or extension == file_ext:
                    filelist_out.append(fi_d)

        # Sort once after the walk instead of once per visited directory.
        # Nothing is sorted when the walk yields no directory at all.
        if walked:
            filelist_out.sort()
        return filelist_out

调用：

# Usage example: convert the Word documents under ``sourcepath`` to PDFs in ``targetpath``.
# NOTE(review): assumes the ``BLL`` package is imported by the calling module — confirm.
doc=BLL.WordCovertPdf.WordTPdf()
sourcepath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/doc/"  # source directory holding the Word documents
targetpath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/pdf/"  # target directory for the generated PDFs
# '.doc' selects both .doc and .docx files; 'all' would hand every file in the
# folder to Word, whether or not it is a Word document.
filelist =doc.getfilenames(sourcepath, [], '.doc')  # collect the Word document paths
valueList =doc.wordtopdf(filelist, targetpath)  # batch-convert the Word documents to PDF
if valueList:
    print("转换成功！")
else:
    print("没有要转换的Word文档或者转换失败！")

def extract_text_from_pdf(self,file_path):
    """
    Extract the text of every page of a PDF (requires ``pip install PyPDF2``).

    Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API, which belongs to
    the same PyPDF2 generation as the ``PdfMerger`` class used by
    ``merge_pdfs``; the older ``PdfFileReader`` / ``extractText`` names were
    removed in PyPDF2 3.0.

    :param file_path: path of the PDF file to read
    :return: the text of all pages concatenated in page order
    """
    # Local import: the file's import section does not bring PyPDF2 into scope.
    import PyPDF2

    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # Join once instead of growing a string page by page.
        return ''.join(page.extract_text() for page in pdf_reader.pages)
 
def merge_pdfs(self,input_paths, output_path):
    """
    Merge several PDF files into one (requires ``pip install PyPDF2``).

    :param input_paths: iterable of PDF paths, merged in the given order
    :param output_path: path of the merged PDF to write
    :return: None; nothing is written when ``input_paths`` is empty
    """
    # Local import: the file's import section does not bring PyPDF2 into scope.
    import PyPDF2

    input_paths = list(input_paths)
    if not input_paths:
        return

    pdf_merger = PyPDF2.PdfMerger()
    try:
        for path in input_paths:
            with open(path, 'rb') as f:
                pdf_merger.append(f)
        # Write the result once, after every input has been appended, rather
        # than rewriting the output file after each individual input.
        with open(output_path, 'wb') as f:
            pdf_merger.write(f)
    finally:
        # Release the resources held by the merger.
        pdf_merger.close()
 
def add_password_protection(self,input_path, output_path, password):
    """
    Write a password-protected copy of a PDF (requires ``pip install PyPDF2``).

    Uses the ``PdfReader`` / ``PdfWriter`` / ``add_page`` API, which belongs to
    the same PyPDF2 generation as the ``PdfMerger`` class used by
    ``merge_pdfs``; the older ``PdfFileReader`` / ``PdfFileWriter`` names were
    removed in PyPDF2 3.0.

    :param input_path: path of the PDF to protect
    :param output_path: path of the encrypted PDF to write
    :param password: password required to open the written PDF
    :return: None
    """
    # Local import: the file's import section does not bring PyPDF2 into scope.
    import PyPDF2

    with open(input_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        pdf_writer = PyPDF2.PdfWriter()
        # Copy EVERY page; adding the page after the loop would keep only the
        # last one and fail on a document without pages.
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)
        # Write while the input file is still open, since the pages come from it.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

posted @ 2023-10-26 15:39 ®Geovin Du Dream Park™ 阅读(29) 评论(0) 编辑收藏举报

®Geovin Du Dream Park™

Why have we only heard about haves and have-nots, but never about doers and doer-nots? 人生是一种心境,生活是一种艺术,成功是一种心态,幸福是一种感觉,竞争是一种建构,情感是一种融合.学习是一种成长.

python: convert Word to PDF

公告

搜索

常用链接

我的标签

积分与排名

随笔分类

随笔档案

文章分类

相册

捷为工作室

阅读排行榜

评论排行榜

推荐排行榜

最新评论