Python: convert Word to PDF

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# encoding: utf-8
# 版权所有 2023 ©涂聚文有限公司
# 许可信息查看: word covert pdf
# 描述:pip install pywin32
#
#      pip install PyPDF2
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/10/26 14:51
# User      : geovindu
# Product   : PyCharm
# Project   : pythonWebScreenShot
# File      : WordCovertPdf.py
# explain   : 学习
 
import os  # 导入系统功能模块
from win32com.client import Dispatch, DispatchEx  # 导入win32com模块的client包下的函数
from win32com.client import constants  #  导入win32com模块的client包下的保存COM常量的类
from win32com.client import gencache    #  导入win32com模块的client包下的gencache函数
import re  # 导入正则表达式模块
import pythoncom  # 导入封装了OLE自动化API的模块,该模块为win32com的子模块
 
class WordTPdf(object):
    """Batch-convert Word documents to PDF through Word COM automation.

    The conversion methods use the module-level ``win32com.client`` names
    (``Dispatch``, ``constants``, ``gencache``) and ``pythoncom``.
    """

    def __init__(self):
        # Neither attribute is read by the methods of this class.
        self.dic = ""
        self.filename = ""

    def wordtopdf(self, filelist, targetpath):
        """Convert every Word document in ``filelist`` to a PDF in ``targetpath``.

        :param filelist: paths of the Word documents to convert
        :param targetpath: directory that receives the generated PDF files
        :return: list of absolute paths of the generated PDFs; ``[]`` when
                 ``filelist`` is an empty list/tuple; ``False`` as soon as one
                 document fails to convert or a ``TypeError`` is raised
        """
        # Nothing to convert: do not start Word at all.
        if isinstance(filelist, (list, tuple)) and not filelist:
            return []
        try:
            # Initialise COM for the calling thread (avoids the
            # "CoInitialize has not been called" error) ...
            pythoncom.CoInitialize()
            try:
                return self._convert_all(filelist, targetpath)
            finally:
                # ... and balance it once every COM reference held by the
                # helper has gone out of scope.
                pythoncom.CoUninitialize()
        except TypeError as e:
            print('出错了!')
            print(e)
            return False

    def _convert_all(self, filelist, targetpath):
        """Run the conversion loop on one Word instance and always quit it."""
        gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
        w = Dispatch("Word.Application")
        try:
            valueList = []
            for fullfilename in filelist:
                # Word needs absolute paths because its working directory is
                # not the script's. Build them directly instead of calling
                # os.chdir, which changed the working directory of the whole
                # process.
                source = os.path.abspath(fullfilename)
                stem = os.path.splitext(os.path.basename(fullfilename))[0]
                output = os.path.abspath(os.path.join(targetpath, stem + ".pdf"))
                exported = self._export_pdf(w, source, output)
                # Require a successful export as well as the file on disk, so
                # a stale PDF from an earlier run is not reported as success.
                if not (exported and os.path.isfile(output)):
                    print('转换失败!')
                    return False
                valueList.append(output)
            return valueList
        finally:
            # Quit Word on every exit path, including the early failure return.
            w.Quit(constants.wdDoNotSaveChanges)

    def _export_pdf(self, word, source, output):
        """Open ``source`` read-only in ``word`` and export it to ``output``.

        Returns ``True`` on success; prints the error and returns ``False``
        otherwise. The document is closed in both cases.
        """
        document = None
        try:
            document = word.Documents.Open(source, ReadOnly=1)
            document.ExportAsFixedFormat(
                output, constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks)
            return True
        except Exception as e:  # best effort: report and let the caller decide
            print(e)
            return False
        finally:
            if document is not None:
                try:
                    document.Close(constants.wdDoNotSaveChanges)
                except Exception as e:
                    print(e)

    def getfilenames(self, filepath='', filelist_out=None, file_ext='all'):
        """Collect file paths under ``filepath``, including sub-directories.

        :param filepath: root directory to walk
        :param filelist_out: list the matches are appended to; a new list is
                             created when omitted
        :param file_ext: ``'.doc'`` selects ``.doc`` and ``.docx`` files (their
                         backslashes are replaced with ``/``), ``'all'``
                         selects every file, any other value selects files
                         with exactly that extension
        :return: ``filelist_out`` with the matches appended, sorted
        """
        # A fresh list per call; a ``[]`` default would be shared between calls.
        if filelist_out is None:
            filelist_out = []
        for fpath, dirs, fs in os.walk(filepath):
            for f in fs:
                fi_d = os.path.join(fpath, f)
                ext = os.path.splitext(fi_d)[1]
                if file_ext == '.doc':
                    if ext in ('.doc', '.docx'):
                        filelist_out.append(re.sub(r'\\', '/', fi_d))
                elif file_ext == 'all' or ext == file_ext:
                    filelist_out.append(fi_d)
        # Sort once after the walk rather than after every directory.
        filelist_out.sort()
        return filelist_out

  

调用:

1
2
3
4
5
6
7
8
9
# NOTE(review): `BLL.WordCovertPdf` must already be imported here; the import is not shown in this snippet.
doc = BLL.WordCovertPdf.WordTPdf()
sourcepath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/doc/"  # source directory holding the Word documents
targetpath = r"C:/Users/geovindu/PycharmProjects/pythonWebScreenShot/pdf/"  # directory that receives the PDFs
# 'all' collects every file under sourcepath; pass '.doc' to collect only .doc/.docx files.
filelist = doc.getfilenames(sourcepath, [], 'all')
valueList = doc.wordtopdf(filelist, targetpath)  # batch-convert the collected files to PDF
if valueList:
    print("转换成功!")
else:
    print("没有要转换的Word文档或者转换失败!")

  

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def extract_text_from_pdf(self,file_path):
    """
    Extract the text of every page of a PDF (requires ``pip install PyPDF2``).

    Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API; the older
    ``PdfFileReader`` / ``numPages`` / ``getPage`` / ``extractText`` names
    were removed in PyPDF2 3.x.

    :param file_path: path of the PDF file to read
    :return: the text of all pages concatenated in page order
    """
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # Extract while the file is still open; join avoids repeated string
        # concatenation. ``or ''`` guards against a page yielding no text.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
 
def merge_pdfs(self,input_paths, output_path):
    """
    Merge several PDF documents into one file.

    :param input_paths: paths of the PDFs to merge, in output order
    :param output_path: path of the merged PDF; nothing is written when
                        ``input_paths`` is empty
    :return: None
    """
    pdf_merger = PyPDF2.PdfMerger()
    try:
        appended = False
        for path in input_paths:
            with open(path, 'rb') as f:
                pdf_merger.append(f)
            appended = True
        # Write the result once, after all inputs have been appended, instead
        # of rewriting the output file after every input.
        if appended:
            with open(output_path, 'wb') as f:
                pdf_merger.write(f)
    finally:
        # Release the merger's resources even if an input cannot be read.
        pdf_merger.close()
 
def add_password_protection(self,input_path, output_path, password):
    """
    Write a password-protected copy of a PDF.

    Uses the ``PdfReader`` / ``PdfWriter`` / ``add_page`` API; the older
    ``PdfFileReader`` / ``PdfFileWriter`` / ``addPage`` names were removed in
    PyPDF2 3.x.

    :param input_path: path of the PDF to protect
    :param output_path: path of the encrypted copy
    :param password: password passed to ``PdfWriter.encrypt``
    :return: None
    """
    with open(input_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        pdf_writer = PyPDF2.PdfWriter()
        # Copy every page; adding the page after the loop kept only the last
        # one and raised NameError for a document without pages.
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)
        pdf_writer.encrypt(password)
        # Write while the input is still open, as the original did.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

  

 

posted @   ®Geovin Du Dream Park™  阅读(29)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· Open-Sora 2.0 重磅开源!
· 周边上新:园子的第一款马克杯温暖上架
历史上的今天:
2022-10-26 Python: State Pattern
2022-10-26 Python: Template Method Pattern
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示