python处理两个表中,筛选具有相同列值的数据
1 import openpyxl 2 import pandas as pd 3 import os 4 5 6 # 导入文件夹的所有文件 7 def get_files_name(): 8 """ 9 用于获取文件名 10 :return: 返回值为文件名组成的列表 11 """ 12 file_list = os.listdir ('./data') 13 return file_list 14 15 16 # 选择要处理的excel表 17 def load_data(file_list): 18 for item_file in file_list: 19 # 获取表格文件 20 file_name = './data/' + item_file 21 print('要处理的文件是:'+file_name) 22 handle (file_name) 23 24 25 # Excel数据处理 26 def handle(file_name): 27 try: 28 with pd.ExcelFile (file_name) as xlsx: 29 # 读取包含带Pandas的公式的excel单元格值 30 workbook = openpyxl.load_workbook (filename=xlsx, data_only=True) 31 # 工作表A 32 sheetA = pd.read_excel (workbook, sheet_name="A", engine='openpyxl') 33 sheetA.drop_duplicates (subset='stu_no', keep='first', inplace=True) 34 data_listA = sheetA['stu_no'].values.tolist () # 将某一列读到列表中 35 # print("data_listA:") 36 # print(data_listA) 37 # 工作表B 38 sheetB = pd.read_excel (workbook, sheet_name="B", engine='openpyxl') 39 sheetB.drop_duplicates (subset='stu_no', keep='first', inplace=True) 40 data_listB = sheetB['stu_no'].values.tolist () # 将某一列读到列表中 41 # print ("data_listA:") 42 # print (data_listA) 43 # 取两个工作表中,指定列的相同数据 44 same_data = [x for x in data_listB if x in data_listA] 45 # 要处理的列名 46 need_col = pd.read_excel (xlsx, sheet_name="A", index_col='trade_code') 47 # print ("need_col:") 48 # print (need_col) 49 # 定义一个空列表 50 need_data = [] 51 # for 处理两个工作表中的数据源 52 for item_same in same_data: 53 item_list = need_col.loc[[item_same]] 54 need_data.append (item_list) 55 print ("need_data:") 56 print (need_data) 57 # 合并查出来的DataFrame的所有数据 58 need_data = pd.concat (need_data) 59 print("need_data合并后的:") 60 print(need_data) 61 # 定义一个writer 62 writer = pd.ExcelWriter (file_name, engine='openpyxl', mode='a', if_sheet_exists='replace') 63 # 接下来还是调用to_excel, 但是第一个参数不再是文件名, 而是上面的writer,将DataFrame写进去 64 need_data.to_excel (writer, sheet_name="same") 65 # 保存并关闭writer, 写入磁盘 66 writer.close () 67 print("文件处理完成") 68 except Exception as e: 69 print(e) 70 pass 71 72 73 if __name__ == "__main__": 74 # 导入文件 75 files = get_files_name () 76 data = {} 77 print ("当前data文件夹下的文件如下:") 78 num = 1 79 for file in files: 80 print (num, file) 81 num += 1 82 83 choice_file_list = [] 84 while 1: 85 index_str = input ( 86 "请选择需要处理的文件序号(多个文件导入时用空格分开,输入0则导入所有文件,输入多文件则自动合并):") 87 index_list = index_str.split (' ') 88 try: 89 index_list.remove ('') 90 except: 91 pass 92 if index_list[0] == '0': 93 choice_file_list = files 94 break 95 else: 96 try: 97 for item in index_list: 98 choice_file_list.append (files[int (item) - 1]) 99 except: 100 print ("输入序号有误") 101 continue 102 if choice_file_list: 103 break 104 else: 105 print ("输入序号有误") 106 load_data (choice_file_list)