小项目:从pdf中抽取数据到excel

从pdf中抽取数据到excel

在日常工作中遇到需要批量处理的pdf表格,并将表格中的数据批量处理至excel中,以水质监测的报告为例

import os
import pdfplumber
import numpy as np
import pandas as pd
import xlwings as xw

def convert_to_letter(number, columnA=0):
    """
    数字转字母列号
    columnA: 你希望A列是第几列(0 or 1)? 默认0
    return: str in upper case
    """
    ab = '_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    n = number - columnA
    x = n % 26
    if n >= 26:
        n = int(n / 26)
        return convert_to_letter(n, 1) + ab[x + 1]
    else:
        return ab[x + 1]


def pdf_to_excel(bag_path,excel_path):
    #先传入一个excel的模板,既我需要达成的一个模板
    """
    将pdf中的数据写入到目标xcel
    :param bag_path: 要写入pdf的文件路径
    :param excel_path: 目标excel的文件路径
    :return: 将一个pdf文件路径写入后的文件路径
    """""
    #复制新的文件,
    lis_path = os.path.dirname(excel_path)
    last_path = os.path.join(lis_path, "此文件为已经追加数据的文件.xls")
    df = pd.read_excel(excel_path)
    df.to_excel(last_path,index=False) #另存excel为一个新的路径

    f_list = os.listdir(bag_path)#处理PDF文件
    name_list = [] #储存文件中已有地名或人名的唯一标识符
    num = 0 #计数
    for path in f_list:
        num = num + 1
        join_path = os.path.join(bag_path, path)#一个pdf文件
        pdf = pdfplumber.open(join_path)


        #采样日期的获取
        first_page = pdf.pages[0]
        lis = first_page.extract_tables()
        for i in lis[0]:
            if '采(送)样\n日期' in i:
                loc = i.index('采(送)样\n日期')
                cydata = i[loc + 1]
                # print(cydata)

        #具体内容(表2内容的获取)
        table = []
        num_pdf = len(pdf.pages)#获取pdf的页数
        for i in range(0, num_pdf):
            first_page = pdf.pages[i]
            lis = first_page.extract_table()
            table = table + lis #对所有页码的数据进行合并

        app = xw.App(visible=True, add_book=False)  # app创建一个excel
        wb = app.books.open(last_path)  # book创建工作薄
        sht = wb.sheets["Sheet1"]
        rng = sht.range('A1').expand('table')
        nrows = rng.rows.count  # 获取行数
        ncols = rng.columns.count  # 获取列数

        palace_name = table[1][1]
        if palace_name  in name_list:print("警告!!!,文件%s中----->%s<------与其他文件重复"%(last_path,table[1][1]))#以地名为唯一标识符,对重复文件进行筛选
        else:

            name_list.append(palace_name)

            #根据地名或人名获取其行号
            data = pd.read_excel(last_path)
            data_list = np.array(data).tolist()
            for i in range(len(data_list)):
                for j in range(len(data_list[i])):
                    if data_list[i][j] == palace_name:nrows_str = i+2
                    # else:
                    #     nrows_str = nrows + 2  # 地名或人名那一行,后续建议利用pandas进行优化
                    #     name = "A" + str(nrows_str)
                    #     sht.range(name).value = palace_name
                    #     print("%s未找到对应的地点或人名,将在列表后空格方生成"%palace_name)

            #采样时间
            for j in range(ncols + 1):  # 根据列数进行查询
                j = j + 1
                b = sht.cells(1, j).value
                if b in ["采样日期时间","检测日期时间"]:
                    j = j - 1
                    ncols_str = convert_to_letter(j)  # 调用列标函数,将数字与字母进行转换
                    total_str = ncols_str + str(nrows_str)
                    sht.range(total_str).value = cydata

            for i in table:#获取一个包含数据的列表 print([i[1], i[2], i[-1]])
                for j in range(ncols+1): #根据列数进行查询
                    j = j + 1
                    b = sht.cells(1, j).value
                    if b in ["总硬度","铬"] and b in i[2]:
                        j = j - 1
                        ncols_str = convert_to_letter(j)  # 调用列标函数,将数字与字母进行转换
                        total_str = ncols_str + str(nrows_str)
                        sht.range(total_str).value = i[-1]
                        break
                    elif b in ["挥发酚类","氯消毒为游离余氯","硝酸盐氮"] and i[2] in b:
                        # print(b)
                        j = j - 1
                        ncols_str = convert_to_letter(j)  # 调用列标函数,将数字与字母进行转换
                        # nrows_str = nrows + 1
                        total_str = ncols_str + str(nrows_str)
                        sht.range(total_str).value = i[-1]
                        break
                    elif i[2] == "大肠艾希氏菌" and b == "大肠埃希氏菌":#处理报告中出现“大肠艾希氏菌”错别字
                        j = j - 1
                        ncols_str = convert_to_letter(j)  # 调用列标函数,将数字与字母进行转换
                        total_str = ncols_str + str(nrows_str)
                        sht.range(total_str).value = i[-1]
                        break
                    elif i[2] == b:
                        j = j - 1
                        ncols_str = convert_to_letter(j) #调用列标函数,将数字与字母进行转换
                        # nrows_str = nrows + 1
                        total_str = ncols_str + str(nrows_str)
                        sht.range(total_str).value = i[-1]
                        break
        wb.save()
        wb.close()
        app.quit()

        print("已经完成%s个pdf文件......"%num)
    return last_path

def change_data(new_excel_path):

    df = pd.read_excel(new_excel_path)

    df.loc[df['总大肠菌群'] == '未检出', '总大肠菌群'] = 0
    df.loc[df['菌落总数'] == '未检出', '菌落总数'] = 0
    df.loc[df['耐热大肠菌群'] == '未检出', '耐热大肠菌群'] = 0
    df.loc[df['大肠埃希氏菌'] == '未检出', '大肠埃希氏菌'] = 0
    df.loc[df['砷'] == '未检出(<0.001)', '砷'] = -1
    df.loc[df['镉'] == '<0.0005', '镉'] = -1
    df.loc[df['铅'] == '<0.0025', '铅'] = -1
    df.loc[df['汞'] == '未检出(<0.0001)', '汞'] = -1
    df.loc[df['硒'] == '未检出(<0.0004)', '硒'] = -1
    df.loc[df['氰化物'] == '<0.002', '氰化物'] = -1
    df.loc[df['四氯化碳'] == '<0.0001', '四氯化碳'] = -1
    df.loc[df['色度'] == '<5', '色度'] = -1
    df.loc[df['浑浊度'] == '<0.5', '浑浊度'] = -1
    df.loc[df['臭和味'] == '无异臭、异味', '臭和味'] = 0
    df.loc[df['肉眼可见物'] == '无', '肉眼可见物'] = 0
    df.loc[df['铝'] == '<0.008', '铝'] = -1
    df.loc[df['铁'] == '未检出(<0.04)', '铁'] = -1
    df.loc[df['锰'] == '未检出(<0.03)', '锰'] = -1
    df.loc[df['铜'] == '未检出(<0.02)', '铜'] = -1
    df.loc[df['锌'] == '未检出(<0.005)', '锌'] = -1
    df.loc[df['阴离子合成洗涤剂'] == '<0.05', '阴离子合成洗涤剂'] = -1
    df.loc[df['氨氮'] == '<0.02', '氨氮'] = -1
    df.loc[df['铬'] == '<0.004', '铬'] = -1
    df.loc[df['挥发酚类'] == '<0.002', '挥发酚类'] = -1

    df.to_excel(new_excel_path, index=False)


def change2_data(new_excel_path):
    df = pd.read_excel(new_excel_path)
    for x in df.columns:
        y_list = []
        for y in df[x]:
            if y in ['未检出', '无异臭、异味','无']:
                y = 0
            else:
                if "<" in str(y):
                    y = -1
            y_list.append(y)
        df[x] = y_list
    df.to_excel(new_excel_path, index=False)


def run():#主函数

    bag_path = input("请输入包含目标pdf文件的上一级文件夹路径:")
    excel_path = input("请输入要写入的exce文件路径:")
    print("----------------->任务开始,请稍等......<------------------")
    new_excel_path = pdf_to_excel(bag_path, excel_path)
    a = change_data(new_excel_path)
    # a = change2_data(new_excel_path)


    print("恭喜您!!!,任务已完成!")


if __name__ == '__main__':
    run()




posted @ 2022-09-26 14:38  小杨的冥想课  阅读(289)  评论(0编辑  收藏  举报