PDF 转换与文字提取:Qt 与 Python 的简单应用笔记
原文链接:https://blog.csdn.net/XMG9017/article/details/126782483?spm=1001.2014.3001.5501
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'PDFto_txt.ui'
#
# Created by: PyQt5 UI code generator 5.15.4
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Form(object):
    """Widget layout of the PDF conversion tool's form.

    Every widget is stored on ``self`` under the same name that is used as
    its Qt object name (``groupBox``, ``pushButton`` ... ``pushButton_6``,
    ``label``, ``label_2``, ``lineEdit``).
    """

    def setupUi(self, Form):
        """Create the child widgets on *Form* with their fixed geometry."""
        Form.setObjectName("Form")
        Form.resize(791, 507)
        Form.setFocusPolicy(QtCore.Qt.NoFocus)

        box = QtWidgets.QGroupBox(Form)
        box.setGeometry(QtCore.QRect(80, 20, 611, 421))
        box.setObjectName("groupBox")
        self.groupBox = box

        # (attribute / object name, widget class, x, y, width, height),
        # listed in the order the widgets are created.
        placements = (
            ("pushButton", QtWidgets.QPushButton, 190, 90, 291, 41),
            ("label", QtWidgets.QLabel, 50, 140, 81, 31),
            ("lineEdit", QtWidgets.QLineEdit, 130, 140, 451, 31),
            ("pushButton_2", QtWidgets.QPushButton, 130, 200, 91, 41),
            ("pushButton_3", QtWidgets.QPushButton, 130, 280, 91, 41),
            ("pushButton_4", QtWidgets.QPushButton, 430, 280, 91, 41),
            ("pushButton_5", QtWidgets.QPushButton, 280, 200, 91, 41),
            ("pushButton_6", QtWidgets.QPushButton, 430, 200, 91, 41),
        )
        for attr_name, widget_cls, x, y, width, height in placements:
            widget = widget_cls(box)
            widget.setGeometry(QtCore.QRect(x, y, width, height))
            widget.setObjectName(attr_name)
            setattr(self, attr_name, widget)

        # The banner label is created last and starts out empty;
        # retranslateUi() fills in its text.
        banner = QtWidgets.QLabel(box)
        banner.setGeometry(QtCore.QRect(70, 30, 521, 41))
        banner.setText("")
        banner.setObjectName("label_2")
        self.label_2 = banner

        self.retranslateUi(Form)
        QtCore.QMetaObject.connectSlotsByName(Form)

    def retranslateUi(self, Form):
        """Set the window title, group title and widget captions."""
        tr = QtCore.QCoreApplication.translate
        Form.setWindowTitle(tr("Form", "PDF转换工具"))
        self.groupBox.setTitle(tr("Form", "主菜单"))

        captions = (
            (self.pushButton, "选择文件(*.pdf)"),
            (self.label, "待处理文件"),
            (self.label_2, "欢迎使用PDF转换工具"),
            (self.pushButton_2, "pdf转txt"),
            (self.pushButton_3, "打开处理结果"),
            (self.pushButton_4, "退出"),
            (self.pushButton_5, "pdf转word"),
            (self.pushButton_6, "pdf转excel"),
        )
        for widget, caption in captions:
            widget.setText(tr("Form", caption))
# PyQt5 front end that extracts the text of a PDF with pdfplumber and writes
# it into the result folder.

import os
import shutil
import sys
import time

import pandas as pd
import pdfplumber
from PyQt5 import QtWidgets
from PyQt5.QtCore import QFileInfo, QUrl
from PyQt5.QtGui import QDesktopServices
from PyQt5.QtWidgets import QFileDialog, QMessageBox

from PDFto_txt import Ui_Form

# Output folder, resolved against the current working directory.
RESULT_DIR = '处理结果'

# Every start begins with an empty result folder: anything left over from a
# previous run is deleted.
if os.path.isdir(RESULT_DIR):
    shutil.rmtree(RESULT_DIR)
os.makedirs(RESULT_DIR)


class mywindow(QtWidgets.QWidget, Ui_Form):
    """Main window: connects the form's buttons to the PDF actions."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.setupUi(self)
        # Path of the PDF picked by the user; empty until one is chosen.
        # Kept on the instance so the convert buttons can detect "nothing
        # selected yet" instead of failing on an undefined name.
        self._pdf_path = ''

        # Signal/slot wiring.
        self.pushButton.clicked.connect(self.shuruwenjianjia)
        self.pushButton_3.clicked.connect(self.DKJG)
        self.pushButton_2.clicked.connect(self.pdf_txt)
        self.pushButton_4.clicked.connect(self.jieshu)
        self.pushButton_5.clicked.connect(self.pdf_word)
        self.pushButton_6.clicked.connect(self.pdf_excel)

    def DKJG(self):
        """Open the result folder in the system file browser."""
        os.makedirs(RESULT_DIR, exist_ok=True)
        folder = os.path.abspath(RESULT_DIR)
        if hasattr(os, 'startfile'):
            os.startfile(folder)
        else:
            # os.startfile only exists on Windows; let Qt open the folder
            # on other platforms.
            QDesktopServices.openUrl(QUrl.fromLocalFile(folder))

    @staticmethod
    def _extract_text(pdf_path):
        """Return the text of all pages of *pdf_path*, one page after another.

        The last line of every page is dropped on the assumption that it is
        the page number. Pages are separated by a newline so the end of one
        page does not run into the start of the next.
        """
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # A page without extractable text may yield None/'' rather
                # than a string with content.
                text = page.extract_text() or ''
                page_texts.append('\n'.join(text.split('\n')[:-1]))
        return '\n'.join(page_texts)

    def _convert(self, out_name, done_message):
        """Extract the selected PDF's text into RESULT_DIR/*out_name*.

        Shows *done_message* in the banner label on success. Problems
        (no file selected, unreadable PDF, write failure) are reported in a
        message box instead of propagating out of the slot.
        """
        if not self._pdf_path:
            QMessageBox.warning(self, '提示', '请先选择要处理的PDF文件!')
            return

        try:
            content = self._extract_text(self._pdf_path)
            os.makedirs(RESULT_DIR, exist_ok=True)
            out_path = os.path.join(RESULT_DIR, out_name)
            # Explicit UTF-8 so characters outside the locale code page do
            # not abort the write.
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(content)
        except Exception as exc:
            # Slot boundary: PDF parsing can fail in many library-specific
            # ways, and an exception escaping a slot would take the whole
            # application down, so report it to the user instead.
            QMessageBox.critical(self, '错误', '转换失败:{}'.format(exc))
            return

        print('处理完成')
        self.label_2.setText(done_message)

    def pdf_txt(self):
        """Write the PDF's text to 'pdf-txt.txt' in the result folder."""
        self._convert('pdf-txt.txt', 'pdf转换txt处理完成!')

    def pdf_word(self):
        """Write the PDF's text to 'pdf-word.docx' in the result folder."""
        # NOTE(review): the file receives plain text, not a real Word
        # document; producing one needs a docx writer this file does not
        # import.
        self._convert('pdf-word.docx', 'pdf转换word处理完成!')

    def pdf_excel(self):
        """Write the PDF's text to 'pdf-word.xlsx' in the result folder."""
        # NOTE(review): the file receives plain text, not a real workbook,
        # and the name says "word" although this is the Excel action --
        # looks like a copy/paste slip. Kept as-is so the output path does
        # not change; confirm before renaming.
        self._convert('pdf-word.xlsx', 'pdf转换excel处理完成!')

    def shuruwenjianjia(self):
        """Let the user pick a PDF and remember its path."""
        chosen, _selected_filter = QFileDialog.getOpenFileName(
            self, "选择PDF文件", "/", "Text Files (*.pdf)")
        if not chosen:
            # Dialog cancelled: keep whatever was selected before.
            return
        print(str(chosen))
        self._pdf_path = chosen
        self.lineEdit.setText(chosen)

    def jieshu(self):
        """Close the window, which ends the application."""
        # os.close() needs a file descriptor and cannot exit the program.
        # Closing the last visible window makes Qt leave the event loop
        # (default quitOnLastWindowClosed behaviour).
        self.close()


if __name__ == "__main__":
    app = QtWidgets.QApplication(sys.argv)
    ui = mywindow()
    ui.show()
    sys.exit(app.exec_())