# Parse PDF files with Python 3.6 by converting them to HTML with pdf2htmlEX.exe.

from html.parser import HTMLParser
import json
import re
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from itertools import islice
import subprocess
import os
import shutil




def runApp(command, message=''):
    """Run ``command`` through the shell and return its captured output.

    ``message`` is encoded and sent to the child's stdin.  Every attempt is
    limited to 30 seconds and a failed attempt is retried, for at most three
    attempts in total.

    Args:
        command: Shell command line to execute.
        message: Text written to the child's standard input.

    Returns:
        A ``(stdout, stderr)`` tuple of bytes from the first attempt that
        completes.

    Raises:
        Exception: If all three attempts fail.  The error raised by the last
            attempt is attached as ``__cause__``.
    """
    # NOTE: shell=True passes ``command`` to the shell verbatim, so it must
    # not be assembled from untrusted text.
    last_error = None
    for _ in range(3):
        try:
            # subprocess.run kills and reaps the child when the timeout
            # expires, so a hung process is not left behind before a retry.
            finished = subprocess.run(
                command,
                shell=True,
                input=message.encode(),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=30,
            )
        except Exception as error:
            last_error = error
        else:
            return finished.stdout, finished.stderr
    raise Exception("Error %s" % command) from last_error

class MyHTMLParser(HTMLParser):
    """Collect the fields of a reimbursement form from pdf2htmlEX HTML.

    Text is ignored until an element with ``id="page-container"`` has been
    opened.  After that ``processState`` moves through these stages:

    * 0 - header: ``key: value`` pairs are stored in ``fdata["meta"]`` until
      a text containing "经费项目" is seen.
    * 1 - waiting for the "合计金额(小写):" text; its last token is stored as
      ``fdata["item"]["total"]``.
    * 2 - waiting for a text containing "结算信息".
    * 3 - settlement rows: a text whose first token is "0".."99" is stored
      in ``fdata["card"]`` under that token; a text starting with
      "预约报销日期" stores the date and ends the parse (state 4).
    """

    # Tokens accepted as the leading label of a settlement row.
    _ROW_LABELS = frozenset(str(i) for i in range(100))

    # A header token ending in one of these is a key.  The original tuple
    # listed the ASCII colon twice; the full-width colon is accepted as well.
    # NOTE(review): the second entry was presumably a full-width colon that
    # got normalized when the code was pasted - confirm against a real form.
    # The same may apply to the "合计金额(小写):" marker below, which is left
    # unchanged.
    _KEY_SUFFIXES = (":", "\uff1a")

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
        self.handledtags = ['div']
        self.processState = 0
        self.fdata = {"meta": {}, "item": {}, "card": {}}
        self.stpdf = False      # True once the page container has been opened
        self.x3 = False         # True when the last start tag carried class "x3"
        self.lastmeta = ""      # most recent header key, used for continuations
        self.itemNum = 0
        self.cardNum = 0

    def _attr(self, attrlist, attrname):
        """Return the value of ``attrname`` in ``attrlist``, or "" if absent."""
        for each in attrlist:
            if attrname == each[0]:
                return each[1]
        return ""

    def handle_starttag(self, tag, attrs):
        """Note the start of the page container and tags with class "x3"."""
        if self._attr(attrs, 'id') == "page-container":
            self.stpdf = True
        # A valueless attribute (``<div class>``) is reported as None.
        # NOTE(review): this is a substring test, so a class such as "x30"
        # matches too - kept as in the original.
        if "x3" in (self._attr(attrs, 'class') or ""):
            self.x3 = True

    def _handle_header(self, data, tokens):
        """Process one text node while in state 0 (form header)."""
        meta = self.fdata["meta"]
        if tokens and tokens[0].endswith(self._KEY_SUFFIXES):
            key = ""
            for token in tokens:
                if token.endswith(self._KEY_SUFFIXES):
                    key = token[:-1]
                    self.lastmeta = key
                else:
                    # A later value token for the same key replaces the
                    # earlier one.
                    meta[key] = token
        elif "经费项目" in data:
            # Drops the first four characters (the length of "经费项目").
            meta["项目"] = data[4:]
            self.processState = 1
        elif self.x3:
            self.x3 = False
            # Continuation text: append to the last key that has a value.
            if self.lastmeta and meta.get(self.lastmeta):
                meta[self.lastmeta] += data

    def handle_data(self, data):
        """Feed one text node through the state machine."""
        if not self.stpdf:
            return
        # Keep the raw text and its tokens separate: the state checks below
        # fall through, and states 2 and 3 need the original string.
        tokens = data.split()
        if self.processState == 0:
            self._handle_header(data, tokens)
        elif self.processState == 1:
            # Whitespace-only text yields no tokens and is skipped.
            if tokens and "合计金额(小写):" in tokens[0]:
                self.fdata["item"]["total"] = tokens[-1]
                self.processState = 2
        # Plain ``if`` on purpose: a state entered above is examined again
        # with the same text.
        if self.processState == 2:
            if "结算信息" in data:
                self.processState = 3
        if self.processState == 3 and tokens:
            label = tokens[0]
            if label in self._ROW_LABELS:
                self.fdata["card"][label] = tokens[1:]
                self.cardNum += 1
            elif "预约报销日期" in label:
                self.fdata["card"]["date"] = " ".join(tokens[1:])
                self.processState = 4
        return


# Folder scanned for PDFs, and the folder that receives a copy of every PDF
# that could not be processed.
_PDF_DIR = "E:/totally/FinancePDF_travel/"
_BAD_FILE_DIR = "E:/totally/bad_file/"

# Keys ("0".."99") under which numbered rows are stored in the parsed data.
_ROW_KEYS = frozenset(str(i) for i in range(100))


def _numbered_rows(section):
    """Return the values of ``section`` whose key is one of "0".."99"."""
    return [value for key, value in section.items() if key in _ROW_KEYS]


# One entry per logical column: (header text, number of extra columns the
# header is merged across, function extracting the value from the parsed data).
_COLUMNS = [
    ("编号", 0, lambda d: d["meta"]["报销单号"]),
    ("项目负责人", 0, lambda d: d["meta"]["项目负责人"]),
    ("项目", 0, lambda d: d["meta"]["项目"]),
    ("报销事由", 0, lambda d: d["meta"]["报销事由"]),
    ("费用合计", 0, lambda d: d["item"]["total"]),
    ("预约报销日期", 0, lambda d: d["card"]["date"]),
    ("结算信息", 2, lambda d: _numbered_rows(d["card"])),
]


def _parse_pdf(pdf_path):
    """Convert ``pdf_path`` to HTML with pdf2htmlEX and parse the result.

    Returns the ``MyHTMLParser`` that was fed the HTML.  Raises ``OSError``
    when the HTML file cannot be read after the conversion.
    """
    html_path = os.path.splitext(pdf_path)[0] + ".html"
    # --dest-dir makes pdf2htmlEX write the HTML next to the PDF, which is
    # where it is read from, instead of into the current working directory.
    _, stderr = runApp('pdf2htmlEX --dest-dir "%s" "%s"'
                       % (os.path.dirname(pdf_path), pdf_path))
    try:
        with open(html_path, encoding="UTF-8") as handle:
            html_text = handle.read()
    except OSError:
        # Show what the converter reported, then let the caller flag the file.
        print(stderr)
        raise
    parser = MyHTMLParser()
    parser.feed(re.sub("<span.+?</span>", " ", html_text))
    parser.close()
    return parser


def _write_rows(ws, first_column, rows):
    """Write each settlement row on its own sheet row, one value per column."""
    for offset, row in enumerate(rows):
        # NOTE(review): rows start on sheet row 3, as in the original
        # (``2 + j + 1``); row 2 stays empty in these columns - confirm this
        # is intended.
        sheet_row = 3 + offset
        if not row:
            ws.cell(row=sheet_row, column=first_column, value="null")
            continue
        # At most four values are written; shorter rows simply fill fewer
        # columns instead of raising IndexError.
        for shift, value in enumerate(row[:4]):
            ws.cell(row=sheet_row, column=first_column + shift, value=value)


def _fit_column_widths(ws):
    """Size every column to its longest value (factor 1.7, capped at 42)."""
    widths = {}
    for row in ws:
        for index, cell in enumerate(row, start=1):
            if cell.value:
                widths[index] = max(widths.get(index, 0), len(str(cell.value)))
    for index, width in widths.items():
        ws.column_dimensions[get_column_letter(index)].width = min(42, width * 1.7)


def _build_workbook(parser):
    """Lay out the parsed form data on a new workbook and return it.

    Raises ``KeyError`` when an expected field is missing from the parsed data.
    """
    wb = Workbook()
    ws = wb.active
    merge_rows = max(parser.cardNum, parser.itemNum)
    column = 1
    for title, extra_columns, extract in _COLUMNS:
        letter = get_column_letter(column)
        ws["%s1" % letter] = title
        if extra_columns:
            ws.merge_cells("%s1:%s1" % (letter, get_column_letter(column + extra_columns)))
        elif merge_rows > 1:
            # Single-value columns span all data rows; with fewer than two
            # rows there is nothing to merge.
            ws.merge_cells("%s2:%s%d" % (letter, letter, 1 + merge_rows))
        value = extract(parser.fdata)
        if isinstance(value, list):
            _write_rows(ws, column, value)
        else:
            ws["%s2" % letter] = value
        column += 1 + extra_columns
    _fit_column_widths(ws)
    return wb


def main():
    """Convert every ``.pdf`` in the source folder to a spreadsheet.

    A PDF that cannot be processed is reported and copied to the bad-file
    folder; processing then continues with the next file.
    """
    failures = 0
    for file_name in os.listdir(_PDF_DIR):
        if os.path.splitext(file_name)[1] != '.pdf':
            continue
        pdf_path = os.path.join(_PDF_DIR, file_name)
        try:
            parser = _parse_pdf(pdf_path)
            # NOTE(review): openpyxl writes the xlsx format; the ".pdf.xls"
            # name is kept from the original - consider ".xlsx".
            _build_workbook(parser).save(pdf_path + ".xls")
        except Exception as error:
            failures += 1
            print(str(failures) + '.' + '无法解析' + file_name + '文件')
            print(repr(error))
            try:
                os.makedirs(_BAD_FILE_DIR, exist_ok=True)
                shutil.copyfile(pdf_path, os.path.join(_BAD_FILE_DIR, file_name))
            except OSError as copy_error:
                # A failed copy must not stop the remaining files.
                print('无法复制' + file_name + '文件')
                print(repr(copy_error))
            else:
                print('已复制' + file_name + '文件')


if __name__ == '__main__':
    main()
 
# Posted 2018-02-05 15:29 by Moucong (blog-page footer; not part of the script).