pdfminert提取PDF中文内容
由于PyPDF2提取中文乱码,无法识别。所以使用pdfminer
pdfminer : https://github.com/euske/pdfminer
from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter from pdfminer.pdfdevice import PDFDevice from pdfminer.layout import LAParams, LTTextBox from pdfminer.converter import PDFPageAggregator import re # Open a PDF file. fp = open('1p.pdf', 'rb') # Create a PDF parser object associated with the file object. parser = PDFParser(fp) # Create a PDF document object that stores the document structure. # Supply the password for initialization. #document = PDFDocument(parser,password) document = PDFDocument(parser) # Check if the document allows text extraction. If not, abort. if not document.is_extractable: raise PDFTextExtractionNotAllowed # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF device object. device = PDFDevice(rsrcmgr) # Set parameters for analysis. laparams = LAParams() # Create a PDF page aggregator object. device = PDFPageAggregator(rsrcmgr, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. TXT = [] for page in PDFPage.create_pages(document): interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result() for x in layout: if isinstance(x, LTTextBox): #print(x.get_text().strip()) #strip()去空行 searchObj = re.search(r'(TPP*\d{8})', x.get_text().strip(), flags=0) #找出TPP的单号 if searchObj: TXT.append(searchObj.group()) print(list(set(TXT))[0])