# pdfminer批量处理PDF文件 (batch-processing PDF files with pdfminer)

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
import os


class PdfForString(object):
    """Print the text of every PDF found in one directory using pdfminer.

    The same directory is used both for listing the files and for building
    their full paths, so the two can never disagree.
    """

    # Directory scanned when no ``pdf_dir`` is given (the original script's folder).
    DEFAULT_PDF_DIR = r'E:\StockExchange\PDF'

    def __init__(self, pdf_dir=None):
        """List the PDF files and build the shared pdfminer objects.

        Args:
            pdf_dir: Directory that contains the PDF files. When omitted,
                ``DEFAULT_PDF_DIR`` is used.

        Raises:
            OSError: If the directory cannot be listed.
        """
        self.pdf_dir = self.DEFAULT_PDF_DIR if pdf_dir is None else pdf_dir
        # Keep only *.pdf entries so sub-directories and unrelated files are
        # never handed to the parser; sort for a reproducible processing order.
        self.pdf_list = sorted(
            name for name in os.listdir(self.pdf_dir)
            if name.lower().endswith('.pdf')
        )
        # Stores shared document resources.
        self.src = PDFResourceManager()
        # Device object that aggregates the layout of each processed page.
        self.device = PDFPageAggregator(self.src, laparams=LAParams())
        # Interpreter that feeds page content into the device.
        self.inter = PDFPageInterpreter(self.src, self.device)

    def for_string(self):
        """Yield the full path of each file named in ``self.pdf_list``.

        Paths are joined onto ``self.pdf_dir``, i.e. the directory the names
        were listed from.
        """
        for pdf in self.pdf_list:
            yield os.path.join(self.pdf_dir, pdf)

    def pdf_analysis(self):
        """Yield, for each PDF, the page iterable returned by ``get_pages()``.

        The underlying file stays open while the caller consumes the yielded
        pages and is closed as soon as this generator is resumed or closed.
        """
        for path in self.for_string():
            # get_pages() may read from the file lazily, so the yield has to
            # happen inside the ``with`` block.
            with open(path, 'rb') as pd_file:
                parser = PDFParser(pd_file)  # PDF file parser object

                # PDF document object, linked to the parser in both directions.
                document = PDFDocument()
                parser.set_document(document)
                document.set_parser(parser)
                # The legacy pdfminer API documents this step as mandatory,
                # even for documents without a password.
                document.initialize()
                yield document.get_pages()

    def get_string(self):
        """Print the text of every horizontal text box on every page."""
        for pages in self.pdf_analysis():
            for page in pages:
                self.inter.process_page(page)
                layout = self.device.get_result()
                for item in layout:
                    if isinstance(item, LTTextBoxHorizontal):
                        print(item.get_text())


# Script entry: build the extractor with its default settings and print the
# text of every PDF it finds.
extractor = PdfForString()
extractor.get_string()

  

# posted @ 2019-11-28 10:00  屁桃  阅读(938)  评论(1)  编辑  收藏  举报