使用PyPDF2结合pdfminer拆分PDF,并提取关键字重命名拆分出来的文件

需求:银行汇款回单PDF有几十页,每一页包含上下两张回单。需要把每一张回单拆分成单独的PDF文件,并以回单上“交易附言”处的流水号(格式为 TPP 加 8 位数字,例如 TPP12345678)重命名拆分出来的文件。

思路:

1.使用PyPDF2把每一页一分为二,输出PDF到一个目录A。

2.循环目录A,使用pdfminer提取TPPXXXXXXXX格式的流水号,重命名PDF文件。

3.使用pyinstaller -F 打包成一个exe文件。(注意:要在C盘打包)CMD: C:\Users\<用户名>\PDF>pyinstaller -F C:\Users\<用户名>\PDF\pdftools.py

# -*- coding: UTF-8 -*-  
from PyPDF2 import PdfFileReader, PdfFileWriter

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.converter import PDFPageAggregator

import re
import os
import os.path

#切割PDF
def split_pdf(infile, out_path):
    """
    Split every page of a PDF horizontally into two single-page PDFs.

    For page number N (1-based) two files are written into out_path:
    ``N_top.pdf`` (upper half of the page) and ``N_bottom.pdf`` (lower half).
    The halves are produced by shrinking the page's media box to the wanted
    region; the media box is assumed to start at (0, 0).

    :param infile: path of the PDF file to split
    :param out_path: directory receiving the generated PDFs; it is created
                     when missing, and a trailing path separator is optional
    :return: None
    """
    os.makedirs(out_path, exist_ok=True)

    with open(infile, 'rb') as source:
        # This reader is only used for measuring, so its pages are never
        # modified and always report the original page size.
        measuring_reader = PdfFileReader(source)

        for i in range(measuring_reader.getNumPages()):
            media_box = measuring_reader.getPage(i).mediaBox
            width = float(media_box.getWidth())
            height = float(media_box.getHeight())
            middle = height / 2

            # (file label, lower-left corner, upper-right corner) of each half.
            halves = (
                ('top', (0, middle), (width, height)),
                ('bottom', (0, 0), (width, middle)),
            )
            for label, lower_left, upper_right in halves:
                # Assigning the corners changes the page object in place, so
                # each half works on a page from a freshly created reader
                # (the original author reported errors when reusing one).
                page = PdfFileReader(source).getPage(i)
                # Lower-left and upper-right together define the rectangle.
                page.mediaBox.lowerLeft = lower_left
                page.mediaBox.upperRight = upper_right

                writer = PdfFileWriter()
                writer.addPage(page)
                out_file_name = os.path.join(
                    out_path, '{}_{}.pdf'.format(i + 1, label))
                with open(out_file_name, 'wb') as outfile:
                    writer.write(outfile)

#重命名PDF
# Serial number expected on a receipt: "TPP" followed by exactly 8 digits.
_SERIAL_PATTERN = re.compile(r'TPP\d{8}')


def _find_serials(pdf_path):
    """Return all serial numbers found in the text boxes of pdf_path.

    The serials are listed in the order pdfminer yields the text boxes.
    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    serials = []
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # get_result() hands back the layout of the page just processed.
                for element in device.get_result():
                    if isinstance(element, LTTextBox):
                        match = _SERIAL_PATTERN.search(element.get_text())
                        if match:
                            serials.append(match.group())
        finally:
            # Release the device even when parsing fails half-way.
            device.close()
    return serials


def _free_target(directory, stem):
    """Return a not-yet-existing '<stem>.pdf' path, adding _1, _2, ... if needed."""
    candidate = os.path.join(directory, stem + '.pdf')
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, '{}_{}.pdf'.format(stem, counter))
        counter += 1
    return candidate


def extractPDF(out_Path):
    """
    Rename every PDF below out_Path after the serial number it contains.

    Each ``.pdf`` file found by walking out_Path is scanned for a serial
    number of the form ``TPP`` + 8 digits. Every match is printed, and the
    file is renamed to ``<first serial>.pdf`` inside its own directory.
    Files without a serial, files that are not PDFs, and files whose name
    already starts with their serial are left untouched. When the target
    name is taken, a numeric suffix is appended instead of overwriting.

    :param out_Path: directory to scan (sub-directories included)
    :return: None
    :raises PDFTextExtractionNotAllowed: if a PDF forbids text extraction
    """
    for parent, _dirnames, filenames in os.walk(out_Path):
        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue

            # Join with the directory being walked so that files in
            # sub-directories resolve correctly.
            source = os.path.join(parent, filename)
            serials = _find_serials(source)
            for serial in serials:
                print(serial)
            if not serials:
                continue

            new_stem = serials[0]
            if filename.startswith(new_stem):
                # Already named after its serial (e.g. on a second run).
                continue
            os.rename(source, _free_target(parent, new_stem))


if __name__ == '__main__':
    # Script entry point: first cut the source PDF into half-page files,
    # then rename those files inside the same output folder.
    source_pdf, target_dir = './PDFfile.pdf', './Single/'
    split_pdf(source_pdf, target_dir)   # writes the halves into target_dir
    extractPDF(target_dir)              # walks target_dir and renames them
    

 

posted @ 2020-04-15 22:39  活捉火星人  阅读(1461)  评论(0编辑  收藏  举报