读pdf文本
# -*- coding: utf-8 -*-
# @Time : 2021/11/12 9:18
# @Author : wangyafeng
# @FileName: readpdf.py
# @Email : yafengwang@dingtalk.com
# @Software: PyCharm
# from https://www.cnblogs.com/wj-1314/p/9429816.html
# fp = urlopen('https://www.tencent.com/zh-cn/articles/802741466496787.pdf')
'''
PDFParser            fetches data from the file
PDFDocument          stores the document data structure in memory
PDFPageInterpreter   parses the content of a page
PDFDevice            turns the parsed content into whatever you need
PDFResourceManager   stores shared resources such as fonts or images
'''
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed


def parse(Path, Save_name):
    """Append the text of every horizontal text box in a PDF to a text file.

    Args:
        Path: Binary file object opened on the PDF to read (despite the
            name, this is an open file, not a filesystem path).
        Save_name: Path of the output text file. It is opened in append
            mode, so existing content is kept and the file is created
            (possibly empty) even when the PDF yields no text boxes.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    parser = PDFParser(Path)  # parser bound to the input file
    document = PDFDocument(parser)  # in-memory PDF document object
    if not document.is_extractable:  # refuse documents that forbid extraction
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Open the output once instead of once per text box, and pin the encoding
    # to UTF-8 so the result does not depend on the platform's locale.
    with open(Save_name, 'a', encoding='utf-8') as f:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    f.write(x.get_text() + "\n")


if __name__ == '__main__':
    # Use a context manager so the PDF handle is closed even if parsing fails.
    with open('双减心得.pdf', 'rb') as pdf_file:
        parse(pdf_file, '1.txt')