PDF转化为txt文件
环境
python>=3.6
包
pip install pdfminer.six
直接贴代码:
#!/usr/bin/env python3.8.6
# _*_ coding: utf-8 _*_
# Description: Convert PDF files to plain-text (.txt) files with pdfminer.six.
# Author: qiaoxiaohang <qiaoxiaohang@beyondsoft.com>
# Date: 2023/4/12 18:20
import os

from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFPage


def fun_pdf(url, name):
    """Extract the text of one PDF and append it to a text file.

    Every layout element that exposes ``get_text`` is printed to stdout and
    appended to ``name`` (UTF-8), page by page in the order the aggregator
    returns them.

    Args:
        url: Path of the PDF file to read.
        name: Path of the text file to append to. It is opened in append
            mode, so running the function twice duplicates the content.
    """
    with open(f'{url}', 'rb') as fp:
        # Parser bound to the open PDF stream.
        parser = PDFParser(fp)
        # Building the document object parses the file structure up front.
        # NOTE(review): PDFDocument is expected to register itself with the
        # parser, so no explicit parser.set_document() call is made here --
        # confirm against the installed pdfminer.six version.
        # For an encrypted PDF a password would have to be supplied here.
        PDFDocument(parser=parser)

        # Resource manager shared by the device and the interpreter.
        resource = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparam = LAParams()
        # Aggregator device that collects the layout of each processed page.
        device = PDFPageAggregator(resource, laparams=laparam)
        # Page interpreter that renders pages into the device.
        interpreter = PDFPageInterpreter(resource, device)

        # Open the output once (instead of once per text element) and make
        # sure it is flushed and closed even if a page fails to process.
        with open(f'{name}', 'a', encoding='utf-8') as fw:
            # NOTE(review): get_pages() builds its own document from fp;
            # PDFPage.create_pages(doc) could reuse the one parsed above.
            for page in PDFPage.get_pages(fp):
                # Render the page, then fetch its layout from the aggregator.
                interpreter.process_page(page)
                layout = device.get_result()
                for out in layout:
                    # Only text-bearing layout objects provide get_text().
                    if hasattr(out, 'get_text'):
                        text = out.get_text()
                        print(text)
                        fw.write(text)


if __name__ == '__main__':
    # Raw string: the path contains backslashes that are not valid escapes.
    dir_path = r'C:\Users\Administrator\Desktop\test\pdf'
    for i in os.listdir(dir_path):
        # splitext keeps inner dots ("a.b.pdf" -> "a.b"), so two PDFs that
        # share a first name segment no longer end up in the same .txt file.
        stem, ext = os.path.splitext(i)
        if ext.lower() != '.pdf':
            # Skip sub-folders and non-PDF files instead of crashing on them.
            continue
        name_txt = stem + '.' + 'txt'
        url = os.path.join(dir_path, i)
        fun_pdf(url, name_txt)