狂自私

导航

python-合并大型Excel

使用openpyxl库实在是太慢,使用openpyxl的只读模式又需要excel的准确尺寸信息,但是每个excel的尺寸都不一样。

我使用zipfile解压excel文件,直接读取xml文件进行合并:

import os
from openpyxl import load_workbook
from openpyxl import Workbook
import zipfile
import re

total_row_number=0  #当此值等于0时,需要写入第一行列头

def 获取行数和列数(path:str)->tuple:
    '''
    返回excel中的第一个sheet页中的数据、第一个sheet的总行数和总列数
    '''
    zip_file= zipfile.ZipFile(path)
    zip_list = zip_file.namelist() # 得到压缩包里所有文件
    for zip_path in zip_list:
        if('xl/worksheets/sheet1.xml' == zip_path):
            filepath = zip_file.extract(zip_path)
            lines=None
            row_number=0
            col_number=0
            result=[]   #形成二维列表
            monitor_typeName = os.path.basename(path).split('-')[0]
            with open(filepath,'r',encoding='utf-8') as in_fd:
                lines = in_fd.readlines()
            for line in lines:
                if('<row r=' == line[:7]):
                    #
                    row_number+=1
                elif('<c r=' == line[:5]):
                    #
                    regex_str= r'<c.+?/c>'
                    col_list = re.findall(regex_str,line)
                    col_number=max(col_number,len(col_list))
                    # t=[re.findall(r'(?<=<t>).+?(?=</t>)|(?<=<v>).+?(?=</v>)',col)[0] for col in col_list]
                    t=[re.sub(r'<.+?>',"",col) for col in col_list]
                    result.append([monitor_typeName]+t)
            result[0][0]='监测类型' #修正下列头
            os.remove(filepath)
            zip_file.close()
            return (result,row_number,col_number);

basic_path ='D:\\BaiduSyncdisk\\temp\\2024年3月4日\\导出所有类型的变电台账数据\\'
files = [f for f in os.listdir(basic_path) if f.endswith(".xlsx")]

def 写入到excel中()->None:
    #速度太慢了。
    global files,basic_path
    out_workbook = Workbook(write_only=True)
    out_sheet=out_workbook.create_sheet()
    for file in files:
        path="{}{}".format(basic_path,file)
        if('明细合并结果' in path):continue
        data_2list,row_number,col_number=获取行数和列数(path)
        if(0!=total_row_number):
            data_2list=data_2list[1:]   #去掉猎头
        for line in data_2list:
            out_sheet.append(line)
        total_row_number+=row_number
        print("{} 以及合并完成。".format(file))
    out_workbook.save('{}{}'.format(basic_path,"明细合并结果.xlsx"))    #只写模式只能保存一次
    out_workbook.close();
def 写入到csv文件中():
    global files,basic_path,total_row_number
    with open("{}明细合并结果.csv".format(basic_path),'w',encoding='utf-8') as out_fd:
        for file in files:
            path="{}{}".format(basic_path,file)
            if('明细合并结果' in path):continue
            data_2list,row_number,col_number=获取行数和列数(path)
            if(0!=total_row_number):
                data_2list=data_2list[1:]   #去掉猎头
            str_list=['\t'.join(data) for data in data_2list]
            out_str='\n'.join(str_list)
            print(out_str,file=out_fd)
            total_row_number+=row_number
            print("{} 以及合并完成。".format(file));
if __name__ == "__main__":
    写入到csv文件中()

 

posted on 2024-03-06 11:51  狂自私  阅读(10)  评论(0编辑  收藏  举报