2018.03.27 python pandas merge join 使用

#2.16 合并 merge-join
import numpy as np
import pandas as pd
df1 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})
df2 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
df3 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'key2':['k0','k1','k0','k1'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})
df4 = pd.DataFrame({'key1':['k0','k1','k2','k3'],
                    'key2':['k0','k1','k0','k3'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
print(df1)
print(df2)
print(pd.merge(df1,df2,on='key1'))
print('------')
#on 参考键

print(df3)
print(df4)
print(pd.merge(df3,df4,on=['key1','key2']))#2个键值同时去匹配

结果:

    A   B key1
0  A0  B0   k0
1  A1  B1   k1
2  A2  B2   k2
3  A3  B3   k3
    C   D key1
0  C0  D0   k0
1  C1  D1   k1
2  C2  D2   k2
3  C3  D3   k3
    A   B key1   C   D
0  A0  B0   k0  C0  D0
1  A1  B1   k1  C1  D1
2  A2  B2   k2  C2  D2
3  A3  B3   k3  C3  D3
------
    A   B key1 key2
0  A0  B0   k0   k0
1  A1  B1   k1   k1
2  A2  B2   k2   k0
3  A3  B3   k3   k1
    C   D key1 key2
0  C0  D0   k0   k0
1  C1  D1   k1   k1
2  C2  D2   k2   k0
3  C3  D3   k3   k3
    A   B key1 key2   C   D
0  A0  B0   k0   k0  C0  D0
1  A1  B1   k1   k1  C1  D1
2  A2  B2   k2   k0  C2  D2
#参数how 合并方式
print(pd.merge(df3,df4,on=['key1','key2'],how='inner'))#取交集
print(pd.merge(df3,df4,on=['key1','key2'],how='outer'))#并集  数据缺失用Nan
print(pd.merge(df3,df4,on=['key1','key2'],how='left'))#按照df3为参考,数据缺失用Nan
print(pd.merge(df3,df4,on=['key1','key2'],how='right'))#按照df4为参考,数据缺失用Nan

 结果:

    A   B key1 key2   C   D
0  A0  B0   k0   k0  C0  D0
1  A1  B1   k1   k1  C1  D1
2  A2  B2   k2   k0  C2  D2
     A    B key1 key2    C    D
0   A0   B0   k0   k0   C0   D0
1   A1   B1   k1   k1   C1   D1
2   A2   B2   k2   k0   C2   D2
3   A3   B3   k3   k1  NaN  NaN
4  NaN  NaN   k3   k3   C3   D3
    A   B key1 key2    C    D
0  A0  B0   k0   k0   C0   D0
1  A1  B1   k1   k1   C1   D1
2  A2  B2   k2   k0   C2   D2
3  A3  B3   k3   k1  NaN  NaN
     A    B key1 key2   C   D
0   A0   B0   k0   k0  C0  D0
1   A1   B1   k1   k1  C1  D1
2   A2   B2   k2   k0  C2  D2
3  NaN  NaN   k3   k3  C3  D3
#参数left_on right_on left_index right_index -》当键不在一个列时,可以单独设置左键或右键
df1 = pd.DataFrame({'key':list('abdcjeu'),
                    'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abc'),
                    'data2':range(3)})
print(df1,'\n',df2)
print(pd.merge(df1,df2,left_on = 'key',right_on = 'rkey'))#当2个dataframe中的主键名称不一致时,用left_on 和right_on 去指定键值

结果:

   data1 key
0      0   a
1      1   b
2      2   d
3      3   c
4      4   j
5      5   e
6      6   u 
    data2 rkey
0      0    a
1      1    b
2      2    c
   data1 key  data2 rkey
0      0   a      0    a
1      1   b      1    b
2      3   c      2    c
df1 = pd.DataFrame({'key':list('abdcjeu'),
                    'data1':range(7)})
df2 = pd.DataFrame({'data2':range(100,105)},index = list('abcde'))
print(df1)
print(df2)
print(pd.merge(df1,df2,left_on='key',right_index=True,sort=True))#以index为键 作为左表 key对应的键值对  sort是否按照key排序
   data1 key
0      0   a
1      1   b
2      2   d
3      3   c
4      4   j
5      5   e
6      6   u
   data2
a    100
b    101
c    102
d    103
e    104
   data1 key  data2
0      0   a    100
1      1   b    101
3      3   c    102
2      2   d    103
5      5   e    104
#pd.join 直接通过索引链接
left = pd.DataFrame({'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']},
                   index = ['k0','k1','k2','k4'])
right = pd.DataFrame({'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']},
                   index = ['k0','k1','k2','k3'])
print(left)
print(right)
print(left.join(right))
print(left.join(right,how='outer'))#拓展

结果:

     A   B
k0  A0  B0
k1  A1  B1
k2  A2  B2
k4  A3  B3
     C   D
k0  C0  D0
k1  C1  D1
k2  C2  D2
k3  C3  D3
     A   B    C    D
k0  A0  B0   C0   D0
k1  A1  B1   C1   D1
k2  A2  B2   C2   D2
k4  A3  B3  NaN  NaN
      A    B    C    D
k0   A0   B0   C0   D0
k1   A1   B1   C1   D1
k2   A2   B2   C2   D2
k3  NaN  NaN   C3   D3
k4   A3   B3  NaN  NaN
df1 = pd.DataFrame({'key':list('bbacaab'),
                    'data1':range(7)})
df2 = pd.DataFrame({'key':list('abc'),
                    'data2':range(3)})
print(pd.merge(df1,df2,left_index=True,right_index=True,suffixes=('_1','_2')))
print(df1.join(df2['data2']))
print('------')#当df1 df2的key相同时,使用suffixes 两个相同的key 成为 key_1 key_2

结果:

   data1 key_1  data2 key_2
0      0     b      0     a
1      1     b      1     b
2      2     a      2     c
   data1 key  data2
0      0   b    0.0
1      1   b    1.0
2      2   a    2.0
3      3   c    NaN
4      4   a    NaN
5      5   a    NaN
6      6   b    NaN
left = pd.DataFrame({'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3'],
                    'key':['k0','k1','k0','k3']})
right = pd.DataFrame({'C':['C0','C1'],
                    'D':['D0','D1']},
                   index = ['k0','k1'])
print(left)
print(right)
print(left.join(right,on='key'))#用left的key和 right的index 合并

结果:

    A   B key
0  A0  B0  k0
1  A1  B1  k1
2  A2  B2  k0
3  A3  B3  k3
     C   D
k0  C0  D0
k1  C1  D1
    A   B key    C    D
0  A0  B0  k0   C0   D0
1  A1  B1  k1   C1   D1
2  A2  B2  k0   C0   D0
3  A3  B3  k3  NaN  NaN
posted @ 2018-03-27 16:57  TAB_Zhu  阅读(390)  评论(0编辑  收藏  举报