python处理两个表中,筛选具有相同列值的数据

  1 import openpyxl
  2 import pandas as pd
  3 import os
  4 
  5 
  6 # 导入文件夹的所有文件
  7 def get_files_name():
  8     """
  9     用于获取文件名
 10     :return: 返回值为文件名组成的列表
 11     """
 12     file_list = os.listdir ('./data')
 13     return file_list
 14 
 15 
 16 # 选择要处理的excel表
 17 def load_data(file_list):
 18     for item_file in file_list:
 19         # 获取表格文件
 20         file_name = './data/' + item_file
 21         print('要处理的文件是:'+file_name)
 22         handle (file_name)
 23 
 24 
 25 # Excel数据处理
 26 def handle(file_name):
 27     try:
 28         with pd.ExcelFile (file_name) as xlsx:
 29             # 读取包含带Pandas的公式的excel单元格值
 30             workbook = openpyxl.load_workbook (filename=xlsx, data_only=True)
 31             # 工作表A
 32             sheetA = pd.read_excel (workbook, sheet_name="A", engine='openpyxl')
 33             sheetA.drop_duplicates (subset='stu_no', keep='first', inplace=True)
 34             data_listA = sheetA['stu_no'].values.tolist ()  # 将某一列读到列表中
 35             # print("data_listA:")
 36             # print(data_listA)
 37             # 工作表B
 38             sheetB = pd.read_excel (workbook, sheet_name="B", engine='openpyxl')
 39             sheetB.drop_duplicates (subset='stu_no', keep='first', inplace=True)
 40             data_listB = sheetB['stu_no'].values.tolist ()  # 将某一列读到列表中
 41             # print ("data_listA:")
 42             # print (data_listA)
 43             # 取两个工作表中,指定列的相同数据
 44             same_data = [x for x in data_listB if x in data_listA]
 45             # 要处理的列名
 46             need_col = pd.read_excel (xlsx, sheet_name="A", index_col='trade_code')
 47             # print ("need_col:")
 48             # print (need_col)
 49             # 定义一个空列表
 50             need_data = []
 51             # for 处理两个工作表中的数据源
 52             for item_same in same_data:
 53                 item_list = need_col.loc[[item_same]]
 54                 need_data.append (item_list)
 55             print ("need_data:")
 56             print (need_data)
 57             # 合并查出来的DataFrame的所有数据
 58             need_data = pd.concat (need_data)
 59             print("need_data合并后的:")
 60             print(need_data)
 61             # 定义一个writer
 62             writer = pd.ExcelWriter (file_name, engine='openpyxl', mode='a', if_sheet_exists='replace')
 63             # 接下来还是调用to_excel, 但是第一个参数不再是文件名, 而是上面的writer,将DataFrame写进去
 64             need_data.to_excel (writer, sheet_name="same")
 65             # 保存并关闭writer, 写入磁盘
 66             writer.close ()
 67             print("文件处理完成")
 68     except Exception as e:
 69         print(e)
 70         pass
 71 
 72 
 73 if __name__ == "__main__":
 74     # 导入文件
 75     files = get_files_name ()
 76     data = {}
 77     print ("当前data文件夹下的文件如下:")
 78     num = 1
 79     for file in files:
 80         print (num, file)
 81         num += 1
 82 
 83     choice_file_list = []
 84     while 1:
 85         index_str = input (
 86             "请选择需要处理的文件序号(多个文件导入时用空格分开,输入0则导入所有文件,输入多文件则自动合并):")
 87         index_list = index_str.split (' ')
 88         try:
 89             index_list.remove ('')
 90         except:
 91             pass
 92         if index_list[0] == '0':
 93             choice_file_list = files
 94             break
 95         else:
 96             try:
 97                 for item in index_list:
 98                     choice_file_list.append (files[int (item) - 1])
 99             except:
100                 print ("输入序号有误")
101                 continue
102         if choice_file_list:
103             break
104         else:
105             print ("输入序号有误")
106     load_data (choice_file_list)
View Code

 

posted @ 2023-03-06 16:46  爱家家的卡卡  阅读(161)  评论(0编辑  收藏  举报