# 读取 PDF 文本

# -*- coding: utf-8 -*-
# @Time : 2021/11/12 9:18
# @Author : wangyafeng
# @FileName: readpdf.py
# @Email : yafengwang@dingtalk.com
# @Software: PyCharm
#from https://www.cnblogs.com/wj-1314/p/9429816.html
#fp = urlopen('https://www.tencent.com/zh-cn/articles/802741466496787.pdf')
'''
PDFParser     从文件中获取数据
PDFDocument   存储文档数据结构到内存中
PDFPageInterpreter 解析page内容
PDFDevice    把解析到的内容转化为你需要的东西
PDFResourceManager 存储共享资源,例如字体或图片
'''

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed


def parse(Path, Save_name):
    """Extract the text of a PDF and append it to a text file.

    Every horizontal text box found by pdfminer's layout analysis is written
    to the output file, followed by a newline, in page order.

    Args:
        Path: Binary file object for the PDF (opened in ``'rb'`` mode).
            Despite the name, this is a file object, not a path string.
        Save_name: Path of the output text file. It is opened in append mode,
            so existing content is preserved; it is created if missing (even
            when the PDF yields no text boxes).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    parser = PDFParser(Path)          # parser bound to the input file object
    document = PDFDocument(parser)    # in-memory PDF document structure

    # Guard clause: bail out early when the document is not extractable.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Open the output once instead of once per text box, and write UTF-8
    # explicitly so non-ASCII text does not depend on the platform's default
    # locale encoding.
    with open(Save_name, 'a', encoding='utf-8') as f:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    f.write(x.get_text() + "\n")


if __name__ == '__main__':
    # Script entry point: extract the text of the sample PDF into 1.txt.
    # The context manager guarantees the PDF handle is closed even if
    # parsing raises.
    with open('双减心得.pdf', 'rb') as pdf_file:
        parse(pdf_file, '1.txt')

 

# posted @ 2021-11-12 15:46  王亚锋  阅读(39)  评论(0)  编辑  收藏  举报