Pandas 合并、连接、去重、替换
1.merge合并 → 类似excel的vlookup
# merge合并 → 类似excel的vlookup df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']}) df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}) df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], 'key2': ['K0', 'K1', 'K0', 'K1'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']}) df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], 'key2': ['K0', 'K0', 'K0', 'K0'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}) print(df1,'\n',df2,'\n') print(pd.merge(df1, df2, on='key')) #按照’key'字段进行连接 print('------','\n') # left:第一个df # right:第二个df # on:参考键 print(df3,'\n',df4,'\n') print(pd.merge(df3, df4, on=['key1','key2'])) # 多个链接键
输出结果:
A B key 0 A0 B0 K0 1 A1 B1 K1 2 A2 B2 K2 3 A3 B3 K3 C D key 0 C0 D0 K0 1 C1 D1 K1 2 C2 D2 K2 3 C3 D3 K3 A B key C D 0 A0 B0 K0 C0 D0 1 A1 B1 K1 C1 D1 2 A2 B2 K2 C2 D2 3 A3 B3 K3 C3 D3 ------ A B key1 key2 0 A0 B0 K0 K0 1 A1 B1 K0 K1 2 A2 B2 K1 K0 3 A3 B3 K2 K1 C D key1 key2 0 C0 D0 K0 K0 1 C1 D1 K1 K0 2 C2 D2 K1 K0 3 C3 D3 K2 K0 A B key1 key2 C D 0 A0 B0 K0 K0 C0 D0 1 A2 B2 K1 K0 C1 D1 2 A2 B2 K1 K0 C2 D2
2. 参数how → 合并方式
# 参数how → 合并方式 print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner')) print('------') # inner:默认,取交集 print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer')) print('------') # outer:取并集,数据缺失范围NaN print(pd.merge(df3, df4, on=['key1','key2'], how = 'left')) print('------') # left:按照df3为参考合并,数据缺失范围NaN print(pd.merge(df3, df4, on=['key1','key2'], how = 'right')) # right:按照df4为参考合并,数据缺失范围NaN
输出结果:
A B key1 key2 C D 0 A0 B0 K0 K0 C0 D0 1 A2 B2 K1 K0 C1 D1 2 A2 B2 K1 K0 C2 D2 ------ A B key1 key2 C D 0 A0 B0 K0 K0 C0 D0 1 A1 B1 K0 K1 NaN NaN 2 A2 B2 K1 K0 C1 D1 3 A2 B2 K1 K0 C2 D2 4 A3 B3 K2 K1 NaN NaN 5 NaN NaN K2 K0 C3 D3 ------ A B key1 key2 C D 0 A0 B0 K0 K0 C0 D0 1 A1 B1 K0 K1 NaN NaN 2 A2 B2 K1 K0 C1 D1 3 A2 B2 K1 K0 C2 D2 4 A3 B3 K2 K1 NaN NaN ------ A B key1 key2 C D 0 A0 B0 K0 K0 C0 D0 1 A2 B2 K1 K0 C1 D1 2 A2 B2 K1 K0 C2 D2 3 NaN NaN K2 K0 C3 D3
3.参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键
# 参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键 df1 = pd.DataFrame({'lkey':list('bbacaab'), 'data1':range(7)}) df2 = pd.DataFrame({'rkey':list('abd'), 'date2':range(3)}) print(pd.merge(df1, df2, left_on='lkey', right_on='rkey')) print('------') # df1以‘lkey’为键,df2以‘rkey’为键 df1 = pd.DataFrame({'key':list('abcdfeg'), 'data1':range(7)}) df2 = pd.DataFrame({'date2':range(100,105)}, index = list('abcde')) print(pd.merge(df1, df2, left_on='key', right_index=True)) # df1以‘key’为键,df2以index为键 # left_index:为True时,第一个df以index为键,默认False # right_index:为True时,第二个df以index为键,默认False # 所以left_on, right_on, left_index, right_index可以相互组合: # left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index #merge是针对DataFrame的,Series不存在此方法
输出结果:
data1 lkey date2 rkey 0 0 b 1 b 1 1 b 1 b 2 6 b 1 b 3 2 a 0 a 4 4 a 0 a 5 5 a 0 a ------ data1 key date2 0 0 a 100 1 1 b 101 2 2 c 102 3 3 d 103 5 5 e 104
4. 连接:concat
(1)
# 连接:concat s1 = pd.Series([1,2,3]) s2 = pd.Series([2,3,4]) print(pd.concat([s1,s2])) print('-----') # 默认axis=0,行+行 s3 = pd.Series([1,2,3],index = ['a','c','h']) s4 = pd.Series([2,3,4],index = ['b','e','d']) print(s3,'\n') print(s4,'\n') print(pd.concat([s3,s4]),'\n') print(pd.concat([s3,s4]).sort_index()) print(pd.concat([s3,s4], axis=1)) print('-----') # axis=1,列+列,成为一个Dataframe
输出结果:
0 1 1 2 2 3 0 2 1 3 2 4 dtype: int64 ----- a 1 c 2 h 3 dtype: int64 b 2 e 3 d 4 dtype: int64 a 1 c 2 h 3 b 2 e 3 d 4 dtype: int64 a 1 b 2 c 2 d 4 e 3 h 3 dtype: int64 0 1 a 1.0 NaN b NaN 2.0 c 2.0 NaN d NaN 4.0 e NaN 3.0 h 3.0 NaN -----
(2)
snew = pd.concat([s3,s4],axis = 1) print(snew) # snew.reset_index(inplace = True) # print(snew) snew.reset_index(inplace = True,drop = True) snew
输出结果:
5.去重 .duplicated
# 去重 .duplicated s = pd.Series([1,1,1,1,2,2,2,3,4,5,5,5,5]) print(s.duplicated()) print(s[s.duplicated() == False]) print('-----') # 判断是否重复 # 通过布尔判断,得到不重复的值 s_re = s.drop_duplicates() print(s_re) print('-----') # drop.duplicates移除重复 # inplace参数:是否替换原值,默认False df = pd.DataFrame({'key1':['a','a',3,4,5], 'key2':['a','a','b','b','c']}) print(df,'\n') print(df.duplicated()) print(df['key2'].duplicated()) # Dataframe中使用duplicated
输出结果:
0 False 1 True 2 True 3 True 4 False 5 True 6 True 7 False 8 False 9 False 10 True 11 True 12 True dtype: bool 0 1 4 2 7 3 8 4 9 5 dtype: int64 ----- 0 1 4 2 7 3 8 4 9 5 dtype: int64 ----- key1 key2 0 a a 1 a a 2 3 b 3 4 b 4 5 c 0 False 1 True 2 False 3 False 4 False dtype: bool 0 False 1 True 2 False 3 True 4 False Name: key2, dtype: bool
6. 替换 .replace
0 a 1 s 2 c 3 a 4 a 5 z 6 s 7 d dtype: object 0 NaN 1 s 2 c 3 NaN 4 NaN 5 z 6 s 7 d dtype: object 0 NaN 1 NaN 2 c 3 NaN 4 NaN 5 z 6 NaN 7 d dtype: object 0 hello world! 1 123 2 c 3 hello world! 4 hello world! 5 z 6 123 7 d dtype: object
练习题:
作业1:按要求创建Dataframe df1、df2,并合并成df3
import numpy as np import pandas as pd df1 = pd.DataFrame({'key':list('abc'), 'values1':np.random.rand(3)}) df2 = pd.DataFrame({'key':list('bcd'), 'values2':np.random.rand(3)}) print(df1,'\n') print(df2,'\n') print(pd.merge(df1, df2,on='key',how = 'outer'))
作业2:按要求创建Dataframe df1、df2,并合并成df3
#练习2 df3 = pd.DataFrame({'key1':['a','b','c'], 'values1':np.random.rand(3)}) df4 = pd.DataFrame({'key2':['b','c','d'], 'values2':np.random.rand(3)}) print(df3,'\n') print(df4,'\n') print(pd.merge(df3, df4,left_on='key1',right_on='key2',how = 'outer'))
作业3:按要求创建Dataframe df1、df2,并合并成df3
#练习3 # df3 = pd.DataFrame({'key':['a','b','c'], # 'values1':np.random.rand(3)}) # df4 = pd.DataFrame({'values2':np.random.rand(3), # 'values3':[5,6,7]},index=['b','c','d']) # print(df3,'\n') # print(df4,'\n') # print(pd.merge(df3, df4,left_on='key',right_index= True))
作业4:按要求创建Dataframe df1、df2,并连接成df3
#练习4 # df3 = pd.DataFrame({'values1':np.random.rand(4), # 'values2':np.random.rand(4)},index = ['a','b','c','d']) # df4 = pd.DataFrame({'values1':np.random.rand(4), # 'values2':np.random.rand(4)},index=['e','f','g','h']) # print(df3,'\n') # print(df4,'\n') # print(pd.concat([df3,df4]))