pdfminer

遍历 PDF 的每一页

from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import *

# Walk every page of test.pdf and print the character count of each
# horizontal text box found by the layout analyser.
#
# The file is opened with a context manager so the handle is released even
# when parsing or layout analysis raises. All processing stays inside the
# ``with`` block because the parser is constructed on top of the open stream.
with open('test.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    # NOTE(review): the second argument is presumably the document password
    # (empty here) -- confirm against the installed pdfminer version.
    document = PDFDocument(parser, '')

    # Refuse to continue when the document forbids text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager(caching=False)
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # The aggregator device exposes the layout of the page just processed.
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                print(len(x.get_text()))

获取 PDF 目录

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

# Print the outline (table of contents) of test.pdf as "level title" lines.
#
# The file is opened with a context manager so the handle is always closed.
# The outline is iterated inside the ``with`` block because the document is
# built on top of the open stream and may read from it while iterating.
with open('test.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser, '')

    outlines = document.get_outlines()
    # Each outline entry is unpacked as a 5-tuple; only the nesting level and
    # the title are printed.
    for (level, title, dest, a, se) in outlines:
        print(level, title)
posted @ 2016-07-25 23:07  4Thing  阅读(322)  评论(0)  编辑  收藏  举报