PDF转换与文字提取:Qt与Python的简单应用笔记

原文链接:https://blog.csdn.net/XMG9017/article/details/126782483?spm=1001.2014.3001.5501

 

 

复制代码
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'PDFto_txt.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Form(object):
    """Widget layout of the PDF conversion tool form.

    The statements below are pyuic5 output for 'PDFto_txt.ui'; prefer
    regenerating from the .ui file over editing them by hand.
    """

    def setupUi(self, Form):
        """Create the child widgets on *Form* and position them.

        Args:
            Form: The widget that becomes the parent of the group box and,
                through it, of every button, label and line edit.
        """
        Form.setObjectName("Form")
        Form.resize(791, 507)
        Form.setFocusPolicy(QtCore.Qt.NoFocus)
        # Group box that contains every other widget of the form.
        self.groupBox = QtWidgets.QGroupBox(Form)
        self.groupBox.setGeometry(QtCore.QRect(80, 20, 611, 421))
        self.groupBox.setObjectName("groupBox")
        # "Choose file (*.pdf)" button.
        self.pushButton = QtWidgets.QPushButton(self.groupBox)
        self.pushButton.setGeometry(QtCore.QRect(190, 90, 291, 41))
        self.pushButton.setObjectName("pushButton")
        # Caption "file to process", placed left of the line edit.
        self.label = QtWidgets.QLabel(self.groupBox)
        self.label.setGeometry(QtCore.QRect(50, 140, 81, 31))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(self.groupBox)
        self.lineEdit.setGeometry(QtCore.QRect(130, 140, 451, 31))
        self.lineEdit.setObjectName("lineEdit")
        # "pdf to txt" button.
        self.pushButton_2 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_2.setGeometry(QtCore.QRect(130, 200, 91, 41))
        self.pushButton_2.setObjectName("pushButton_2")
        # "Open results" button.
        self.pushButton_3 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_3.setGeometry(QtCore.QRect(130, 280, 91, 41))
        self.pushButton_3.setObjectName("pushButton_3")
        # "Exit" button.
        self.pushButton_4 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_4.setGeometry(QtCore.QRect(430, 280, 91, 41))
        self.pushButton_4.setObjectName("pushButton_4")
        # "pdf to word" button.
        self.pushButton_5 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_5.setGeometry(QtCore.QRect(280, 200, 91, 41))
        self.pushButton_5.setObjectName("pushButton_5")
        # "pdf to excel" button.
        self.pushButton_6 = QtWidgets.QPushButton(self.groupBox)
        self.pushButton_6.setGeometry(QtCore.QRect(430, 200, 91, 41))
        self.pushButton_6.setObjectName("pushButton_6")
        # Banner label at the top of the group box; its text is assigned in
        # retranslateUi, so it starts out empty here.
        self.label_2 = QtWidgets.QLabel(self.groupBox)
        self.label_2.setGeometry(QtCore.QRect(70, 30, 521, 41))
        self.label_2.setText("")
        self.label_2.setObjectName("label_2")

        self.retranslateUi(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Assign the translatable texts of the window and its widgets.

        Args:
            Form: The widget whose window title is set.
        """
        _translate = QtCore.QCoreApplication.translate
        Form.setWindowTitle(_translate("Form", "PDF转换工具"))
        self.groupBox.setTitle(_translate("Form", "主菜单"))
        self.pushButton.setText(_translate("Form", "选择文件(*.pdf)"))
        self.label.setText(_translate("Form", "待处理文件"))
        self.label_2.setText(_translate("Form", "欢迎使用PDF转换工具"))
        self.pushButton_2.setText(_translate("Form", "pdf转txt"))
        self.pushButton_3.setText(_translate("Form", "打开处理结果"))
        self.pushButton_4.setText(_translate("Form", "退出"))
        self.pushButton_5.setText(_translate("Form", "pdf转word"))
        self.pushButton_6.setText(_translate("Form", "pdf转excel"))


复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import pdfplumber
import pandas as pd
import os
import time
import shutil
# Start every run with an empty results directory: remove the one left by an
# earlier run (if any), then create it again.
# NOTE(review): this executes at import time and deletes everything under
# '处理结果' relative to the current working directory.
if os.path.isdir('处理结果'):
    shutil.rmtree('处理结果')
os.makedirs('处理结果')
# os.mkdir('处理结果')
from PyQt5 import QtWidgets
from PyQt5.QtCore import QFileInfo
from PyQt5.QtWidgets import QFileDialog, QMessageBox
from PDFto_txt import Ui_Form
class mywindow(QtWidgets.QWidget, Ui_Form):
    """Main window of the PDF conversion tool.

    Connects the buttons created by ``Ui_Form.setupUi`` to slots that pick a
    PDF file, extract its text with pdfplumber and write that text into the
    results directory.
    """

    # Directory that receives every output file.
    RESULT_DIR = '处理结果'

    def __init__(self):
        super(mywindow, self).__init__()
        self.setupUi(self)
        # Path of the PDF picked in the file dialog; empty until one is
        # chosen.  Kept on the instance so the conversion slots never touch
        # an undefined module global.
        self._pdf_path = ''
        # Signal/slot wiring: one handler per button.
        self.pushButton.clicked.connect(self.shuruwenjianjia)
        self.pushButton_3.clicked.connect(self.DKJG)
        self.pushButton_2.clicked.connect(self.pdf_txt)
        self.pushButton_4.clicked.connect(self.jieshu)
        self.pushButton_5.clicked.connect(self.pdf_word)
        self.pushButton_6.clicked.connect(self.pdf_excel)

    def DKJG(self):
        """Open the results directory with the system's default handler."""
        # The directory may have been removed while the program was running;
        # recreate it so there is something to open.
        os.makedirs(self.RESULT_DIR, exist_ok=True)
        # NOTE: os.startfile is available on Windows only.
        os.startfile(self.RESULT_DIR)

    @staticmethod
    def _extract_text(pdf_path):
        """Return the text of every page of *pdf_path*, one page per block.

        The last line of each page is dropped, which is how the original
        code strips the page number printed at the bottom of a page.
        """
        pages = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() can yield None for a page without
                # extractable text; treat that as an empty page instead of
                # failing on .split().
                text = page.extract_text() or ''
                pages.append('\n'.join(text.split('\n')[:-1]))
        # Separate pages with a newline so the last line of one page is not
        # fused with the first line of the next.
        return '\n'.join(pages)

    def _convert(self, out_name, done_message):
        """Write the selected PDF's text to RESULT_DIR/out_name and report it.

        Args:
            out_name: File name to create inside the results directory.
            done_message: Text shown in the banner label on success.
        """
        if not self._pdf_path:
            QMessageBox.warning(self, '提示', '请先选择PDF文件')
            return
        try:
            content = self._extract_text(self._pdf_path)
            os.makedirs(self.RESULT_DIR, exist_ok=True)
            out_path = os.path.join(self.RESULT_DIR, out_name)
            # Explicit UTF-8: the locale default encoding cannot represent
            # every character a PDF may contain.
            with open(out_path, 'w', encoding='utf-8') as out_file:
                out_file.write(content)
        except Exception as exc:
            # Slot boundary: an exception escaping a slot terminates the
            # application, so report it to the user instead.
            QMessageBox.critical(self, '错误', '处理失败: {}'.format(exc))
            return
        print('处理完成')
        self.label_2.setText(done_message)

    def pdf_txt(self):
        """Save the selected PDF's text as 'pdf-txt.txt'."""
        self._convert('pdf-txt.txt', 'pdf转换txt处理完成!')

    def pdf_word(self):
        """Save the selected PDF's text as 'pdf-word.docx'."""
        # NOTE(review): the file holds plain text, not a real Word document;
        # producing one would need a docx writer.
        self._convert('pdf-word.docx', 'pdf转换word处理完成!')

    def pdf_excel(self):
        """Save the selected PDF's text as 'pdf-word.xlsx'."""
        # NOTE(review): the file holds plain text, not a real workbook, and
        # the 'pdf-word' stem looks like a copy-paste leftover; the name is
        # kept so existing users still find their output.
        self._convert('pdf-word.xlsx', 'pdf转换excel处理完成!')

    def shuruwenjianjia(self):
        """Ask for a PDF file and remember it as the file to convert."""
        chosen_path, _selected_filter = QFileDialog.getOpenFileName(
            self, "选择PDF文件", "/", "Text Files (*.pdf)")
        if not chosen_path:
            # Dialog cancelled: keep whatever was selected before.
            return
        self._pdf_path = chosen_path
        print(str(chosen_path))  # full path including file name and suffix
        self.lineEdit.setText(chosen_path)

    def jieshu(self):
        """Close the window in response to the exit button."""
        # os.close() needs a file descriptor and raised TypeError here;
        # closing the widget is what ends the Qt event loop once no window
        # remains open.
        self.close()
  
if __name__ == "__main__":
    # Script entry point: build the Qt application, show the main window and
    # hand the event loop's return code back to the interpreter.
    import sys

    application = QtWidgets.QApplication(sys.argv)
    main_window = mywindow()
    main_window.show()
    exit_status = application.exec_()
    sys.exit(exit_status)

  

 

posted @   xiaomage9017  阅读(134)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· Open-Sora 2.0 重磅开源!
· 周边上新:园子的第一款马克杯温暖上架
点击右上角即可分享
微信分享提示