小项目:从pdf中抽取数据到excel
从pdf中抽取数据到excel
在日常工作中遇到需要批量处理的pdf表格,并将表格中的数据批量处理至excel中,以水质监测的报告为例
import os
import pdfplumber
import numpy as np
import pandas as pd
import xlwings as xw
def convert_to_letter(number, columnA=0):
"""
数字转字母列号
columnA: 你希望A列是第几列(0 or 1)? 默认0
return: str in upper case
"""
ab = '_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
n = number - columnA
x = n % 26
if n >= 26:
n = int(n / 26)
return convert_to_letter(n, 1) + ab[x + 1]
else:
return ab[x + 1]
def pdf_to_excel(bag_path,excel_path):
#先传入一个excel的模板,既我需要达成的一个模板
"""
将pdf中的数据写入到目标xcel
:param bag_path: 要写入pdf的文件路径
:param excel_path: 目标excel的文件路径
:return: 将一个pdf文件路径写入后的文件路径
"""""
#复制新的文件,
lis_path = os.path.dirname(excel_path)
last_path = os.path.join(lis_path, "此文件为已经追加数据的文件.xls")
df = pd.read_excel(excel_path)
df.to_excel(last_path,index=False) #另存excel为一个新的路径
f_list = os.listdir(bag_path)#处理PDF文件
name_list = [] #储存文件中已有地名或人名的唯一标识符
num = 0 #计数
for path in f_list:
num = num + 1
join_path = os.path.join(bag_path, path)#一个pdf文件
pdf = pdfplumber.open(join_path)
#采样日期的获取
first_page = pdf.pages[0]
lis = first_page.extract_tables()
for i in lis[0]:
if '采(送)样\n日期' in i:
loc = i.index('采(送)样\n日期')
cydata = i[loc + 1]
# print(cydata)
#具体内容(表2内容的获取)
table = []
num_pdf = len(pdf.pages)#获取pdf的页数
for i in range(0, num_pdf):
first_page = pdf.pages[i]
lis = first_page.extract_table()
table = table + lis #对所有页码的数据进行合并
app = xw.App(visible=True, add_book=False) # app创建一个excel
wb = app.books.open(last_path) # book创建工作薄
sht = wb.sheets["Sheet1"]
rng = sht.range('A1').expand('table')
nrows = rng.rows.count # 获取行数
ncols = rng.columns.count # 获取列数
palace_name = table[1][1]
if palace_name in name_list:print("警告!!!,文件%s中----->%s<------与其他文件重复"%(last_path,table[1][1]))#以地名为唯一标识符,对重复文件进行筛选
else:
name_list.append(palace_name)
#根据地名或人名获取其行号
data = pd.read_excel(last_path)
data_list = np.array(data).tolist()
for i in range(len(data_list)):
for j in range(len(data_list[i])):
if data_list[i][j] == palace_name:nrows_str = i+2
# else:
# nrows_str = nrows + 2 # 地名或人名那一行,后续建议利用pandas进行优化
# name = "A" + str(nrows_str)
# sht.range(name).value = palace_name
# print("%s未找到对应的地点或人名,将在列表后空格方生成"%palace_name)
#采样时间
for j in range(ncols + 1): # 根据列数进行查询
j = j + 1
b = sht.cells(1, j).value
if b in ["采样日期时间","检测日期时间"]:
j = j - 1
ncols_str = convert_to_letter(j) # 调用列标函数,将数字与字母进行转换
total_str = ncols_str + str(nrows_str)
sht.range(total_str).value = cydata
for i in table:#获取一个包含数据的列表 print([i[1], i[2], i[-1]])
for j in range(ncols+1): #根据列数进行查询
j = j + 1
b = sht.cells(1, j).value
if b in ["总硬度","铬"] and b in i[2]:
j = j - 1
ncols_str = convert_to_letter(j) # 调用列标函数,将数字与字母进行转换
total_str = ncols_str + str(nrows_str)
sht.range(total_str).value = i[-1]
break
elif b in ["挥发酚类","氯消毒为游离余氯","硝酸盐氮"] and i[2] in b:
# print(b)
j = j - 1
ncols_str = convert_to_letter(j) # 调用列标函数,将数字与字母进行转换
# nrows_str = nrows + 1
total_str = ncols_str + str(nrows_str)
sht.range(total_str).value = i[-1]
break
elif i[2] == "大肠艾希氏菌" and b == "大肠埃希氏菌":#处理报告中出现“大肠艾希氏菌”错别字
j = j - 1
ncols_str = convert_to_letter(j) # 调用列标函数,将数字与字母进行转换
total_str = ncols_str + str(nrows_str)
sht.range(total_str).value = i[-1]
break
elif i[2] == b:
j = j - 1
ncols_str = convert_to_letter(j) #调用列标函数,将数字与字母进行转换
# nrows_str = nrows + 1
total_str = ncols_str + str(nrows_str)
sht.range(total_str).value = i[-1]
break
wb.save()
wb.close()
app.quit()
print("已经完成%s个pdf文件......"%num)
return last_path
def change_data(new_excel_path):
df = pd.read_excel(new_excel_path)
df.loc[df['总大肠菌群'] == '未检出', '总大肠菌群'] = 0
df.loc[df['菌落总数'] == '未检出', '菌落总数'] = 0
df.loc[df['耐热大肠菌群'] == '未检出', '耐热大肠菌群'] = 0
df.loc[df['大肠埃希氏菌'] == '未检出', '大肠埃希氏菌'] = 0
df.loc[df['砷'] == '未检出(<0.001)', '砷'] = -1
df.loc[df['镉'] == '<0.0005', '镉'] = -1
df.loc[df['铅'] == '<0.0025', '铅'] = -1
df.loc[df['汞'] == '未检出(<0.0001)', '汞'] = -1
df.loc[df['硒'] == '未检出(<0.0004)', '硒'] = -1
df.loc[df['氰化物'] == '<0.002', '氰化物'] = -1
df.loc[df['四氯化碳'] == '<0.0001', '四氯化碳'] = -1
df.loc[df['色度'] == '<5', '色度'] = -1
df.loc[df['浑浊度'] == '<0.5', '浑浊度'] = -1
df.loc[df['臭和味'] == '无异臭、异味', '臭和味'] = 0
df.loc[df['肉眼可见物'] == '无', '肉眼可见物'] = 0
df.loc[df['铝'] == '<0.008', '铝'] = -1
df.loc[df['铁'] == '未检出(<0.04)', '铁'] = -1
df.loc[df['锰'] == '未检出(<0.03)', '锰'] = -1
df.loc[df['铜'] == '未检出(<0.02)', '铜'] = -1
df.loc[df['锌'] == '未检出(<0.005)', '锌'] = -1
df.loc[df['阴离子合成洗涤剂'] == '<0.05', '阴离子合成洗涤剂'] = -1
df.loc[df['氨氮'] == '<0.02', '氨氮'] = -1
df.loc[df['铬'] == '<0.004', '铬'] = -1
df.loc[df['挥发酚类'] == '<0.002', '挥发酚类'] = -1
df.to_excel(new_excel_path, index=False)
def change2_data(new_excel_path):
df = pd.read_excel(new_excel_path)
for x in df.columns:
y_list = []
for y in df[x]:
if y in ['未检出', '无异臭、异味','无']:
y = 0
else:
if "<" in str(y):
y = -1
y_list.append(y)
df[x] = y_list
df.to_excel(new_excel_path, index=False)
def run():#主函数
bag_path = input("请输入包含目标pdf文件的上一级文件夹路径:")
excel_path = input("请输入要写入的exce文件路径:")
print("----------------->任务开始,请稍等......<------------------")
new_excel_path = pdf_to_excel(bag_path, excel_path)
a = change_data(new_excel_path)
# a = change2_data(new_excel_path)
print("恭喜您!!!,任务已完成!")
if __name__ == '__main__':
run()
记录学习的点点滴滴