# Parse PDF files with Python (relies on pdf2htmlEX.exe; written for Python 3.6)

from html.parser import HTMLParser
import json
import re
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from itertools import islice
import subprocess
import os
import shutil




def runApp(command, message=''):
    """Run a shell command, retrying up to three times, and return its output.

    Args:
        command: Command line handed to the shell (``shell=True``), so the
            caller is responsible for quoting any untrusted parts.
        message: Text written to the child's stdin after ``str.encode()``.

    Returns:
        Tuple ``(stdout, stderr)`` with the bytes captured from the child.
        The exit status is not inspected.

    Raises:
        Exception: ``"Error <command>"`` once all three attempts have failed
            (for instance when each one exceeded the 30 second timeout).
            The last underlying error is attached as ``__cause__``.
    """
    max_attempts = 3
    last_error = None
    for _ in range(max_attempts):
        p = None
        try:
            p = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            return p.communicate(message.encode(), timeout=30)
        except Exception as e:
            last_error = e
            # communicate() does not stop the child on timeout; kill and reap
            # it so a retry does not leave stray processes and open pipes.
            if p is not None and p.poll() is None:
                p.kill()
                p.communicate()
    raise Exception("Error %s" % command) from last_error

class MyHTMLParser(HTMLParser):
    """Collect form fields from the text nodes of a converted PDF page.

    Text is ignored until an element with ``id="page-container"`` has been
    opened. After that ``handle_data`` walks through ``processState``:

    * 0 -- collect ``key: value`` pairs into ``fdata["meta"]`` until a text
      containing "经费项目" is seen (stored under "项目").
    * 1 -- wait for the "合计金额(小写)：" line; its last token is stored as
      ``fdata["item"]["total"]``.
    * 2 -- wait for a text containing "结算信息".
    * 3 -- store rows whose first token is "0".."99" in ``fdata["card"]``
      until the "预约报销日期" line, stored as ``fdata["card"]["date"]``.
    * 4 -- finished; later text is ignored.
    """

    # Row labels accepted in state 3; built once instead of on every call.
    _ROW_IDS = frozenset(map(str, range(100)))

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
        self.handledtags = ['div']
        self.processState = 0
        self.fdata = {"meta": {}, "item": {}, "card": {}}
        self.stpdf = False    # True once the page container has been opened
        self.x3 = False       # True after a start tag whose class contains "x3"
        self.lastmeta = ""    # most recent key seen in state 0
        self.itemNum = 0      # never incremented by this class
        self.cardNum = 0      # number of numbered rows stored in state 3

    def _attr(self, attrlist, attrname):
        """Return the value of ``attrname`` in ``attrlist``, or "" if absent.

        The value is ``None`` when the attribute is present without a value.
        """
        for each in attrlist:
            if attrname == each[0]:
                return each[1]
        return ""

    def handle_starttag(self, tag, attrs):
        """Track entry into the page container and elements of class "x3"."""
        if self._attr(attrs, 'id') == "page-container":
            self.stpdf = True
        # HTMLParser reports a valueless attribute as None; treat it as empty
        # so the membership test cannot raise TypeError.
        # NOTE(review): this is a substring test, so classes such as "x30"
        # also match -- confirm whether only the exact class is intended.
        if "x3" in (self._attr(attrs, 'class') or ""):
            self.x3 = True

    def handle_data(self, data):
        """Feed one text node through the state machine described above."""
        if not self.stpdf:
            return

        # Keep the raw text and its tokens apart: substring tests use the
        # text, positional tests use the tokens. Whitespace-only text yields
        # no tokens and must not be indexed.
        tokens = data.split()

        if self.processState == 0:
            if tokens and tokens[0].endswith((":", "：")):
                for token in tokens:
                    if token.endswith((":", "：")):
                        self.lastmeta = token[:-1]
                    else:
                        # Later values for the same key overwrite earlier ones.
                        self.fdata["meta"][self.lastmeta] = token
            elif "经费项目" in data:
                # Drop the four-character "经费项目" prefix.
                self.fdata["meta"]["项目"] = data[4:]
                self.processState = 1
            elif self.x3:
                # Continuation text: append to the last key that has a value.
                self.x3 = False
                if self.lastmeta and self.fdata["meta"].get(self.lastmeta):
                    self.fdata["meta"][self.lastmeta] += data
        elif self.processState == 1:
            if tokens and "合计金额(小写)：" in tokens[0]:
                self.fdata["item"]["total"] = tokens[-1]
                self.processState = 2

        if self.processState == 2:
            if "结算信息" in data:
                self.processState = 3

        if self.processState == 3 and tokens:
            if tokens[0] in self._ROW_IDS:
                self.fdata["card"][tokens[0]] = tokens[1:]
                self.cardNum += 1
            elif "预约报销日期" in tokens[0]:
                self.fdata["card"]["date"] = " ".join(tokens[1:])
                self.processState = 4

def _iter_title_columns(title):
    """Yield ``(index, label, span)`` for each real header in ``title``.

    A title of the form ``"<label><d>*"`` covers ``d`` extra columns; the
    placeholder entries for those columns are skipped. ``span`` is ``None``
    for ordinary titles.
    """
    i = 0
    while i < len(title):
        raw = title[i]
        if raw.endswith("*"):
            span = int(raw[-2])
            yield i, raw[:-2], span
            i += span + 1
        else:
            yield i, raw, None
            i += 1


def _numbered_rows(section):
    """Return the values of ``section`` whose keys are "0".."99"."""
    row_ids = frozenset(map(str, range(100)))
    return [value for key, value in section.items() if key in row_ids]


def _fit_column_widths(ws):
    """Size every column of ``ws`` to its longest value, capped at 42."""
    # Keyed by column index so empty leading cells cannot shift the widths
    # onto the wrong column.
    widths = {}
    for row in ws:
        for i, cell in enumerate(row):
            if not cell.value:
                continue
            widths[i] = max(widths.get(i, 0), len(str(cell.value)))
    for i, column_width in widths.items():
        ws.column_dimensions[get_column_letter(i + 1)].width = min(42, column_width * 1.7)


def _build_workbook(hp):
    """Lay out the fields collected by a fed ``MyHTMLParser`` in a workbook.

    Row 1 holds the headers and row 2 the scalar fields; settlement rows are
    written from row 3 downwards.

    Raises:
        KeyError: if a required field is missing from ``hp.fdata``.
    """
    title = ["编号", "项目负责人", "项目", "报销事由", "费用合计", "预约报销日期", "结算信息2*", "", ""]

    wb = Workbook()
    ws = wb.active

    for i, label, span in _iter_title_columns(title):
        col = get_column_letter(i + 1)
        ws["%s1" % col] = label
        if span is not None:
            ws.merge_cells("%s1:%s1" % (col, get_column_letter(i + 1 + span)))

    # Stretch each scalar column over as many rows as there are detail rows.
    MergeBoxNum = max(hp.cardNum, hp.itemNum)
    if MergeBoxNum > 0:
        for i, label, span in _iter_title_columns(title):
            if span is not None:
                continue
            col = get_column_letter(i + 1)
            ws.merge_cells("%s2:%s%d" % (col, col, 1 + MergeBoxNum))

    # One extractor per non-empty title, in title order. The last one has no
    # matching title in the current layout and is therefore never called.
    vfunc = [lambda x: x["meta"]["报销单号"],
             lambda x: x["meta"]["项目负责人"],
             lambda x: x["meta"]["项目"],
             lambda x: x["meta"]["报销事由"],
             lambda x: x["item"]["total"],
             lambda x: x["card"]["date"],
             lambda x: _numbered_rows(x["card"]),
             lambda x: _numbered_rows(x["item"])]

    vfuncID = 0
    for i, name in enumerate(title):
        if not name:
            continue
        dat = vfunc[vfuncID](hp.fdata)
        vfuncID += 1
        if not isinstance(dat, list):
            ws["%s2" % get_column_letter(i + 1)] = dat
            continue
        if "结算信息" not in name:
            continue
        for j, tokens in enumerate(dat):
            row = 3 + j
            if not tokens:
                ws.cell(row=row, column=i + 1, value="null")
                continue
            # At most four values per row; short rows no longer raise
            # IndexError.
            for k, value in enumerate(tokens[:4]):
                ws.cell(row=row, column=i + 1 + k, value=value)

    _fit_column_widths(ws)
    return wb


def _convert_pdf(pdfFile):
    """Convert one PDF to HTML with pdf2htmlEX, parse it, and save a workbook.

    Any exception raised here means the file could not be processed.
    """
    # NOTE(review): the HTML is expected next to the PDF; confirm that
    # pdf2htmlEX writes there for the working directory in use.
    htmlFile = pdfFile[:-4] + ".html"
    # NOTE(review): openpyxl presumably writes .xlsx content whatever the
    # extension; the original "<name>.pdf.xls" file name is kept unchanged.
    xlsxFile = pdfFile + ".xls"

    s, e = runApp('pdf2htmlEX "%s"' % (pdfFile))
    try:
        with open(htmlFile, encoding="UTF-8") as fh:
            html_code = re.sub("<span.+?</span>", "  ", fh.read())
    except Exception as e2:
        # Show the converter's stderr next to the read error, then let the
        # caller treat the file as unparsable.
        print(e)
        print(e2)
        raise

    hp = MyHTMLParser()
    hp.feed(html_code)
    hp.close()

    _build_workbook(hp).save(xlsxFile)


if __name__ == '__main__':
    n = 0
    addressPDF = "E:/totally/FinancePDF_travel/"
    badDir = u"E:\\totally\\bad_file\\"

    for fileNAME in os.listdir(addressPDF):
        if os.path.splitext(fileNAME)[1] != '.pdf':
            continue
        pdfFile = addressPDF + fileNAME
        try:
            _convert_pdf(pdfFile)
        except Exception:
            # Best effort: report the file and set a copy aside, then go on
            # with the next one.
            n += 1
            print(str(n) + '.' + '无法解析' + fileNAME + '文件')
            os.makedirs(badDir, exist_ok=True)
            shutil.copyfile(pdfFile, badDir + fileNAME)
            print('已复制' + fileNAME + '文件')

posted @ 2018-02-05 15:29 Moucong 阅读(1848) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

Moucong

python 分析PDF文件 （基于使用pdf2htmlEX.exe python3.6）

公告

python 分析PDF文件（基于使用pdf2htmlEX.exe python3.6）