pdfminer获取整页文本

 1 #! python2
 2 # coding: utf-8
 3 
 4 import sys
 5 from cStringIO import StringIO
 6 from pdfminer import pdfinterp
 7 from pdfminer import pdfpage
 8 from pdfminer import converter
 9 from pdfminer import layout
10 
11 with file(path, 'rb') as fp:
12     rsrcmgr = pdfinterp.PDFResourceManager()
13     retstr = StringIO()
14     codec = 'utf-8'
15     laparams = layout.LAParams()
16     device = converter.TextConverter(
17         rsrcmgr, retstr, codec=codec, laparams=laparams)
18     # Create a PDF interpreter object.
19     interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
20     # Process each page contained in the document.
21     pages = pdfpage.PDFPage.get_pages(fp)
22     for page in pages:
23         interpreter.process_page(page)
24         data = retstr.getvalue()

 

posted @ 2018-07-12 09:37  Rosen369  阅读(284)  评论(0)  编辑  收藏  举报