python读取大型Excel文件

前言

python读取Excel文件的库有pandasopenpyxlxlrd等,但是各有优缺点,虽说openpyxl在指定read_only参数后读取大型文件的速度非常快,但是它只适用于xlsx类型文件,且有些银行系统下载的报表不手动打开进行保存它都无法完整读取一行数据。最终基本都会使用pandas读取,也方便后期数据清洗。下面就主要针对pandas版本低于2.2时,无法通过直接指定engine='calamine',但是又想提升读取大型Excel的速度,那么还是可以用python-calamine库实现的,并且速度提升非常显著!

# 安装
pip install python-calamine

2.0<pandas<2.2

import pandas as pd
from python_calamine.pandas import pandas_monkeypatch


pandas_monkeypatch()
df = pd.read_excel(file_path, engine="calamine")
df.head()

pandas<2.0

示例1

import pandas as pd
from python_calamine import CalamineWorkbook


wb = CalamineWorkbook.from_path(filename)
# sht_names = wb.sheet_names    # 获取所有sheet名
# row_list = wb.get_sheet_by_name(sheet).to_python()
row_list = wb.get_sheet_by_index(0).to_python()
df = pd.DataFrame(row_list)
df.columns = df.iloc[0].values
df.drop(0, inplace=True)
df.reset_index(drop=True, inplace=True)
print(df.head())
print(df.shape)

"""
  Column_1 Column_2 Column_3 Column_4  ... Column_17 Column_18 Column_19 Column_20
0     72.0     32.0     34.0     46.0  ...      78.0      68.0      31.0      97.0
1     46.0     85.0     62.0     62.0  ...      30.0      79.0      28.0      94.0
2     96.0     44.0     80.0      2.0  ...      44.0      30.0      39.0       3.0
3     13.0     26.0     25.0     61.0  ...      69.0      57.0      17.0      89.0
4     15.0     76.0     21.0     74.0  ...      64.0      60.0       4.0      88.0

[5 rows x 20 columns]
(120000, 20)
耗时:2.229128837585449
"""

示例2

生成器

import pandas as pd
from python_calamine import CalamineWorkbook


def iter_excel_calamine(file):
    workbook = CalamineWorkbook.from_filelike(file)
    rows = iter(workbook.get_sheet_by_index(0).to_python())
    # headers = list(map(str, next(rows)))
    for row in rows:
        # yield dict(zip(headers, row))
        yield row


tmp_list = []
with open(filename, "rb") as fh:
    for row in iter_excel_calamine(fh):
        tmp_list.append(row)
df = pd.DataFrame(tmp_list)
print(df.head())
print(df.shape)

"""
         0         1         2   ...         17         18         19
0  Column_1  Column_2  Column_3  ...  Column_18  Column_19  Column_20
1      72.0      32.0      34.0  ...       68.0       31.0       97.0
2      46.0      85.0      62.0  ...       79.0       28.0       94.0
3      96.0      44.0      80.0  ...       30.0       39.0        3.0
4      13.0      26.0      25.0  ...       57.0       17.0       89.0

[5 rows x 20 columns]
(120001, 20)
耗时:2.1872622966766357
"""
posted @ 2024-05-24 14:58  cnblogs用户  阅读(169)  评论(0编辑  收藏  举报