2018.03.27 python pandas merge join 使用
#2.16 合并 merge-join import numpy as np import pandas as pd df1 = pd.DataFrame({'key1':['k0','k1','k2','k3'], 'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3']}) df2 = pd.DataFrame({'key1':['k0','k1','k2','k3'], 'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']}) df3 = pd.DataFrame({'key1':['k0','k1','k2','k3'], 'key2':['k0','k1','k0','k1'], 'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3']}) df4 = pd.DataFrame({'key1':['k0','k1','k2','k3'], 'key2':['k0','k1','k0','k3'], 'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']}) print(df1) print(df2) print(pd.merge(df1,df2,on='key1')) print('------') #on 参考键 print(df3) print(df4) print(pd.merge(df3,df4,on=['key1','key2']))#2个键值同时去匹配
结果:
A B key1 0 A0 B0 k0 1 A1 B1 k1 2 A2 B2 k2 3 A3 B3 k3 C D key1 0 C0 D0 k0 1 C1 D1 k1 2 C2 D2 k2 3 C3 D3 k3 A B key1 C D 0 A0 B0 k0 C0 D0 1 A1 B1 k1 C1 D1 2 A2 B2 k2 C2 D2 3 A3 B3 k3 C3 D3 ------ A B key1 key2 0 A0 B0 k0 k0 1 A1 B1 k1 k1 2 A2 B2 k2 k0 3 A3 B3 k3 k1 C D key1 key2 0 C0 D0 k0 k0 1 C1 D1 k1 k1 2 C2 D2 k2 k0 3 C3 D3 k3 k3 A B key1 key2 C D 0 A0 B0 k0 k0 C0 D0 1 A1 B1 k1 k1 C1 D1 2 A2 B2 k2 k0 C2 D2
#参数how 合并方式 print(pd.merge(df3,df4,on=['key1','key2'],how='inner'))#取交集 print(pd.merge(df3,df4,on=['key1','key2'],how='outer'))#并集 数据缺失用Nan print(pd.merge(df3,df4,on=['key1','key2'],how='left'))#按照df3为参考,数据缺失用Nan print(pd.merge(df3,df4,on=['key1','key2'],how='right'))#按照df4为参考,数据缺失用Nan
结果:
A B key1 key2 C D 0 A0 B0 k0 k0 C0 D0 1 A1 B1 k1 k1 C1 D1 2 A2 B2 k2 k0 C2 D2 A B key1 key2 C D 0 A0 B0 k0 k0 C0 D0 1 A1 B1 k1 k1 C1 D1 2 A2 B2 k2 k0 C2 D2 3 A3 B3 k3 k1 NaN NaN 4 NaN NaN k3 k3 C3 D3 A B key1 key2 C D 0 A0 B0 k0 k0 C0 D0 1 A1 B1 k1 k1 C1 D1 2 A2 B2 k2 k0 C2 D2 3 A3 B3 k3 k1 NaN NaN A B key1 key2 C D 0 A0 B0 k0 k0 C0 D0 1 A1 B1 k1 k1 C1 D1 2 A2 B2 k2 k0 C2 D2 3 NaN NaN k3 k3 C3 D3
#参数left_on right_on left_index right_index -》当键不在一个列时,可以单独设置左键或右键 df1 = pd.DataFrame({'key':list('abdcjeu'), 'data1':range(7)}) df2 = pd.DataFrame({'rkey':list('abc'), 'data2':range(3)}) print(df1,'\n',df2) print(pd.merge(df1,df2,left_on = 'key',right_on = 'rkey'))#当2个dataframe中的主键名称不一致时,用left_on 和right_on 去指定键值
结果:
data1 key 0 0 a 1 1 b 2 2 d 3 3 c 4 4 j 5 5 e 6 6 u data2 rkey 0 0 a 1 1 b 2 2 c data1 key data2 rkey 0 0 a 0 a 1 1 b 1 b 2 3 c 2 c
df1 = pd.DataFrame({'key':list('abdcjeu'), 'data1':range(7)}) df2 = pd.DataFrame({'data2':range(100,105)},index = list('abcde')) print(df1) print(df2) print(pd.merge(df1,df2,left_on='key',right_index=True,sort=True))#以index为键 作为左表 key对应的键值对 sort是否按照key排序
data1 key 0 0 a 1 1 b 2 2 d 3 3 c 4 4 j 5 5 e 6 6 u data2 a 100 b 101 c 102 d 103 e 104 data1 key data2 0 0 a 100 1 1 b 101 3 3 c 102 2 2 d 103 5 5 e 104
#pd.join 直接通过索引链接 left = pd.DataFrame({'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3']}, index = ['k0','k1','k2','k4']) right = pd.DataFrame({'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']}, index = ['k0','k1','k2','k3']) print(left) print(right) print(left.join(right)) print(left.join(right,how='outer'))#拓展
结果:
A B k0 A0 B0 k1 A1 B1 k2 A2 B2 k4 A3 B3 C D k0 C0 D0 k1 C1 D1 k2 C2 D2 k3 C3 D3 A B C D k0 A0 B0 C0 D0 k1 A1 B1 C1 D1 k2 A2 B2 C2 D2 k4 A3 B3 NaN NaN A B C D k0 A0 B0 C0 D0 k1 A1 B1 C1 D1 k2 A2 B2 C2 D2 k3 NaN NaN C3 D3 k4 A3 B3 NaN NaN
df1 = pd.DataFrame({'key':list('bbacaab'), 'data1':range(7)}) df2 = pd.DataFrame({'key':list('abc'), 'data2':range(3)}) print(pd.merge(df1,df2,left_index=True,right_index=True,suffixes=('_1','_2'))) print(df1.join(df2['data2'])) print('------')#当df1 df2的key相同时,使用suffixes 两个相同的key 成为 key_1 key_2
结果:
data1 key_1 data2 key_2 0 0 b 0 a 1 1 b 1 b 2 2 a 2 c data1 key data2 0 0 b 0.0 1 1 b 1.0 2 2 a 2.0 3 3 c NaN 4 4 a NaN 5 5 a NaN 6 6 b NaN
left = pd.DataFrame({'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3'], 'key':['k0','k1','k0','k3']}) right = pd.DataFrame({'C':['C0','C1'], 'D':['D0','D1']}, index = ['k0','k1']) print(left) print(right) print(left.join(right,on='key'))#用left的key和 right的index 合并
结果: