py脚本一:业务中excel报表处理

excel报表处理

根据业务需求，每月都要对报表处理一次，手动太麻烦，还容易出错，用py脚本，实现对应处理，能更快；个人需求可能并不符合所有要求

涉及python模块
pandas、numpy、re
代码:

import pandas as pd
import numpy as np
import re
import os

def list_columns(df):
    print("列名标号:")
    for i, col in enumerate(df.columns):
        print(f"{i}: {col}")

def remove_columns_from_excel(df):#1列表删除
    list_columns(df)
    selected_columns = input("请输入要保留的列的标号（以空格分隔）: ").split()
    selected_columns = [int(col) for col in selected_columns]
    return df.iloc[:, selected_columns]

def clean_urls_in_column(df):#2url匹配
    list_columns(df)
    column_index = int(input("请输入URL所在列的标号: "))

    # 正则表达式匹配基本的URL结构
    url_pattern = re.compile(r'^(https?://)?([\w\-\.]+)(\.\w+)(/[\w\-\.]*)*')

    def clean_url(cell):
        if pd.isna(cell) or cell in ['undefined', 'NaN', 'NaN//undefined', '待确认']:
            return ""

        # 确保cell是字符串
        cell = str(cell).strip()

        # 去除两侧的引号
        if cell.startswith('"') and cell.endswith('"'):
            cell = cell[1:-1]

        # 修正分隔符（将逗号、中文逗号等替换为点）
        cell = re.sub(r'[，,]', '.', cell)

        # 使用正则表达式匹配并修复URL
        match = url_pattern.match(cell)
        if match:
            protocol = match.group(1) or "http://"
            domain = match.group(2) + match.group(3)
            path = match.group(4) or ""
            return f"{protocol}{domain}{path}"
        else:
            return ""

    df.iloc[:, column_index] = df.iloc[:, column_index].apply(clean_url)
    return df

def filter_and_delete_rows(df):#3
    list_columns(df)
    column_index = int(input("请输入要操作的列的标号: "))

    # 获取唯一值列表，包括空值
    unique_values = df.iloc[:, column_index].unique()
    value_to_index = {str(i): v for i, v in enumerate(unique_values) if pd.notna(v)}

    # 添加空值选项
    if df.iloc[:, column_index].isnull().any():
        value_to_index[str(len(value_to_index))] = None

    # 列出所有唯一值并标号
    print("可用的筛选条件:")
    for i, value in value_to_index.items():
        display_value = value if value is not None else '空值'
        print(f"{i}: {display_value}")

    # 接收用户输入的标号
    condition_indexes = input("请输入要删除的条件标号(可以输入多个标号，用空格分隔): ").split()

    # 将输入的标号转换为对应的值
    conditions_to_delete = [value_to_index.get(idx) for idx in condition_indexes if idx in value_to_index]

    # 筛选并删除符合条件的行，包括空值
    df = df[~df.iloc[:, column_index].isin(conditions_to_delete) & ~df.iloc[:, column_index].isnull()]

    return df

def search_and_delete(df):#4
    list_columns(df)
    print("选择删除方式:")
    print("1: 删除整行")
    print("2: 仅删除单元格内容")
    delete_option = int(input("请输入选项: "))

    search_terms = input("请输入要检索的内容(多个内容请用空格分隔): ").split()

    if delete_option == 1:
        df = df[~df.apply(lambda row: any(term in str(cell) for term in search_terms for cell in row), axis=1)]
    elif delete_option == 2:
        def clean_cell(cell):
            cell_str = str(cell)
            for term in search_terms:
                if term in cell_str:
                    cell_str = cell_str.replace(term, "")
            return cell_str.strip()
        
        df = df.applymap(clean_cell)

    return df


def replace_chinese_symbols(df):#5
    # 定义中文符号和对应的英文符号
    replacements = {
        '，': ',',
        '。': '.',
        '！': '!',
        '？': '?',
        '：': ':',
        '；': ';',
        '（': '(',
        '）': ')',
        '【': '[',
        '】': ']',
        '《': '<',
        '》': '>',
        '、': ',',
        '——': '-'
    }

    def replace_symbols(cell):
        if pd.isna(cell) or cell == "":
            return cell  # 保持空白或 NaN

        if isinstance(cell, str):
            for ch, repl in replacements.items():
                cell = cell.replace(ch, repl)
        return cell

    df = df.applymap(replace_symbols)
    return df
	
def remove_duplicate_urls(df):
    list_columns(df)
    column_index = int(input("请输入要去重的列的标号: "))

    # 获取指定列的数据
    column_data = df.iloc[:, column_index]

    # 创建一个新的列表来存储处理后的数据
    cleaned_column = []
    seen = set()

    for cell in column_data:
        if pd.notna(cell):
            # 拆分内容为列表并去重
            urls = cell.split('; ')
            unique_urls = []
            for url in urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)
            # 如果该单元格中有唯一的URL，保留它们，否则置空
            if unique_urls:
                cleaned_column.append('; '.join(unique_urls))
            else:
                cleaned_column.append("")
        else:
            cleaned_column.append(cell)

    # 更新DataFrame的指定列
    df.iloc[:, column_index] = cleaned_column

    return df
	
def generate_sub_table_in_same_file(df, writer):
    list_columns(df)
    
    selected_columns = input("请输入要保留的列的标号（以空格分隔）: ").split()
    selected_columns = [int(col) for col in selected_columns]

    # 添加是否进行筛选的选项
    apply_filter = input("是否要进行筛选? (y/n): ").strip().lower()

    if apply_filter == 'y':
        filter_column_index = int(input("请输入要根据哪列进行筛选的列标号: "))
        # 使用 fillna 将 NaN 替换为字符串标识符，以便用户选择
        unique_values = df.iloc[:, filter_column_index].fillna("空值").unique()

        print("筛选条件标号:")
        for i, value in enumerate(unique_values):
            print(f"{i}: {value}")
        condition_index = int(input("请输入筛选的条件标号: "))
        condition_value = unique_values[condition_index]

        # 选择筛选并删除还是筛选并保留
        filter_option = input("请选择操作: 1 - 筛选并删除, 2 - 筛选并保留: ").strip()

        # 处理筛选条件，包括空值
        if condition_value == "空值":
            mask = df.iloc[:, filter_column_index].isna()
        else:
            mask = df.iloc[:, filter_column_index] == condition_value

        if filter_option == '1':
            df_sub = df[~mask]
        elif filter_option == '2':
            df_sub = df[mask]
        else:
            print("无效选项，默认筛选并删除")
            df_sub = df[~mask]
    else:
        # 不进行筛选，直接保留所有行
        df_sub = df.copy()

    # 无论是否筛选，保留用户选择的列
    df_sub = df_sub.iloc[:, selected_columns]

    new_sheet_name = input("请输入新表的名称: ")
    df_sub.to_excel(writer, sheet_name=new_sheet_name, index=False)


def split_table_in_half(df, writer):
    # 计算总行数并计算出中点
    total_rows = len(df)
    midpoint = total_rows // 2
    
    # 确保第一部分和第二部分数据的数量
    first_half = df.iloc[:midpoint, :]
    second_half = df.iloc[midpoint:, :]

    # 获取用户输入的新表名称
    first_sheet_name = input("请输入第一部分表的名称: ")
    second_sheet_name = input("请输入第二部分表的名称: ")

    # 保存两部分数据到新的表中
    first_half.to_excel(writer, sheet_name=first_sheet_name, index=False)
    second_half.to_excel(writer, sheet_name=second_sheet_name, index=False)

    print(f"表已分割，并保存为 '{first_sheet_name}' 和 '{second_sheet_name}' 两个新表。")

def append_data_from_excel_b_to_a(df_a, df_b):
    # 获取A和B表的列名
    columns_a = df_a.columns
    columns_b = df_b.columns

    # 创建一个包含所有A表列的空值DataFrame
    empty_df = pd.DataFrame(columns=columns_a)

    # 遍历B表的列，如果列在A表中存在，则将数据添加到A表对应的列，否则添加空值列
    for col in columns_a:
        if col in columns_b:
            empty_df[col] = df_b[col]
        else:
            empty_df[col] = pd.NA  # 添加空值列

    # 将处理后的B表数据追加到A表
    df_a = pd.concat([df_a, empty_df], ignore_index=True)

    return df_a


def main():
    while True:
        file_path = input("请输入Excel文件路径: ")
        xls = pd.ExcelFile(file_path)
        
        # 如果文件中有多个表，让用户选择要使用的表
        if len(xls.sheet_names) > 1:
            print("请选择要使用的表:")
            for i, sheet_name in enumerate(xls.sheet_names):
                print(f"{i}: {sheet_name}")
            sheet_index = int(input("请输入表的标号: "))
            df = pd.read_excel(xls, sheet_name=xls.sheet_names[sheet_index])
        else:
            df = pd.read_excel(xls)

        with pd.ExcelWriter(file_path, mode='a', engine='openpyxl') as writer:
            print("\n请选择要执行的操作:")
            print("0: 用户添加")
            print("1: 列表删除")
            print("2: URL格式识别与删除")
            print("3: 对用户指定列筛选并删除筛出的行")
            print("4: 指定内容检索并删除")
            print("5: 替换所有中文符号为英文")
            print("6: 域名去重")
            print("7: 附表生成")
            print("8: 表格对半分割")

            choice = int(input("请输入选项: "))

            if choice == 0:
                file_b_path = input("请输入Excel B文件路径: ")
                df_b = pd.read_excel(file_b_path)
                df = append_data_from_excel_b_to_a(df, df_b)
            elif choice == 1:
                df = remove_columns_from_excel(df)
            elif choice == 2:
                df = clean_urls_in_column(df)
            elif choice == 3:
                df = filter_and_delete_rows(df)
            elif choice == 4:
                df = search_and_delete(df)
            elif choice == 5:
                df = replace_chinese_symbols(df)
            elif choice == 6:
                df = remove_duplicate_urls(df)
            elif choice == 7:
                generate_sub_table_in_same_file(df, writer)
            elif choice == 8:
                split_table_in_half(df, writer)
            else:
                print("无效选项")
                return

            if choice not in [7, 8]:
                # 生成新的文件名，如果存在同名文件则追加数字后缀
                base_name, ext = os.path.splitext(file_path)
                output_path = base_name + f'_output.xlsx'
                counter = 1
                while os.path.exists(output_path):
                    output_path = base_name + f'_output_{counter}.xlsx'
                    counter += 1

                df.to_excel(output_path, index=False)
                print(f"操作后的Excel文件已保存为: {output_path}")
        
        continue_choice = input("是否继续执行其他操作？(y/n): ").strip().lower()
        if continue_choice != 'y':
            print("程序结束。")
            break



if __name__ == "__main__":
    main()

功能0:用户添加

使用场景，A表多个表头与参数，需向A表补充B表中记录的信息，且B表信息不一定全，B表中内容补充到A表末尾

功能1:列表删除

使用场景:列出所有表头，按需求输入要保留的列名，其他的删除，且按照输入的列名顺序对剩下列排序

功能2:URL格式识别与删除

适用场景:对指定列，进行url正则匹配，删除不是url格式的内容，并对只有域名的进行补充完整协议http://

功能3:对用户指定列筛选并删除筛出的行

适用场景:列出所有列名，选择要筛的列，分类并列出对应列的内容，对指定内容进行删除(会删除整行数据)

功能4: 指定内容检索并删除

使用场景:在表格中指定内容进行删选删除，一种为仅删除该内容，另一种为删除整行内容

功能5: 替换所有中文符号为英文

使用场景:替换符号

功能6: 域名去重

使用场景:去除某列中重复的域名

功能7: 附表生成

使用场景:选择要保留的列，选择是要根据哪列进行筛选，是筛选并删除还是筛选并保留，之后生成一张新表。

功能8: 表格对半分割

使用场景:对指定表格中的内容进行分割，方便后续处理

posted @ 2024-10-10 11:40 客服小祥-10086 阅读(20) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· Markdown原始语法个人记录

· windows应急响应(三)

· python表格处理--1

· Python 操作 Excel

· 标题：Python脚本：将Excel文件拆分成多个工作表

阅读排行：
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型，支持深度思考和联网搜索！
· 使用C#创建一个MCP客户端
· ollama系列1：轻松3步本地部署deepseek，普通电脑可用
· 基于 Docker 搭建 FRP 内网穿透开源项目（很简单哒）
· 按钮权限的设计及实现

公告

昵称：客服小祥-10086
园龄： 2年4个月
粉丝： 0
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

daydaydream

生年不满百，常怀千岁忧。昼短苦夜长，何不秉烛游

py脚本一:业务中excel报表处理

excel报表处理

公告

搜索

常用链接

我的标签

合集

随笔分类

随笔档案

阅读排行榜

daydaydream

生年不满百，常怀千岁忧。 昼短苦夜长，何不秉烛游

py脚本一:业务中excel报表处理

excel报表处理

公告

搜索

常用链接

我的标签

合集

随笔分类

随笔档案

阅读排行榜

生年不满百，常怀千岁忧。昼短苦夜长，何不秉烛游