Pandas 合并、连接、去重、替换

1.merge合并 → 类似excel的vlookup

# merge合并 → 类似excel的vlookup

df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})

print(df1,'\n',df2,'\n')
print(pd.merge(df1, df2, on='key'))  #按照’key'字段进行连接
print('------','\n')
# left:第一个df
# right:第二个df
# on:参考键

print(df3,'\n',df4,'\n')
print(pd.merge(df3, df4, on=['key1','key2']))
# 多个链接键

输出结果:

  A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3 
     C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3 

    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
------ 

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1 
     C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0 

    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2

2. 参数how → 合并方式

# 参数how → 合并方式

print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner'))  
print('------')
# inner:默认,取交集

print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer'))  
print('------')
# outer:取并集,数据缺失范围NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'left'))  
print('------')
# left:按照df3为参考合并,数据缺失范围NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'right'))  
# right:按照df4为参考合并,数据缺失范围NaN

输出结果:

 A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
------
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
------
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
------
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3

3.参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键

# 参数 left_on, right_on, left_index, right_index → 当键不为一个列时,可以单独设置左键与右键

df1 = pd.DataFrame({'lkey':list('bbacaab'),
                   'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
                   'date2':range(3)})
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))
print('------')
# df1以‘lkey’为键,df2以‘rkey’为键

df1 = pd.DataFrame({'key':list('abcdfeg'),
                   'data1':range(7)})
df2 = pd.DataFrame({'date2':range(100,105)},
                  index = list('abcde'))
print(pd.merge(df1, df2, left_on='key', right_index=True))
# df1以‘key’为键,df2以index为键
# left_index:为True时,第一个df以index为键,默认False
# right_index:为True时,第二个df以index为键,默认False

# 所以left_on, right_on, left_index, right_index可以相互组合:
# left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index 
#merge是针对DataFrame的,Series不存在此方法

输出结果:

 data1 lkey  date2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
------
   data1 key  date2
0      0   a    100
1      1   b    101
2      2   c    102
3      3   d    103
5      5   e    104

4. 连接:concat

(1)

# 连接:concat

s1 = pd.Series([1,2,3])
s2 = pd.Series([2,3,4])
print(pd.concat([s1,s2]))
print('-----')
# 默认axis=0,行+行

s3 = pd.Series([1,2,3],index = ['a','c','h'])
s4 = pd.Series([2,3,4],index = ['b','e','d'])
print(s3,'\n')
print(s4,'\n')
print(pd.concat([s3,s4]),'\n')
print(pd.concat([s3,s4]).sort_index())
print(pd.concat([s3,s4], axis=1))
print('-----')
# axis=1,列+列,成为一个Dataframe

输出结果:

0    1
1    2
2    3
0    2
1    3
2    4
dtype: int64
-----
a    1
c    2
h    3
dtype: int64 

b    2
e    3
d    4
dtype: int64 

a    1
c    2
h    3
b    2
e    3
d    4
dtype: int64 

a    1
b    2
c    2
d    4
e    3
h    3
dtype: int64
     0    1
a  1.0  NaN
b  NaN  2.0
c  2.0  NaN
d  NaN  4.0
e  NaN  3.0
h  3.0  NaN
-----

(2)

snew  = pd.concat([s3,s4],axis = 1)
print(snew)
# snew.reset_index(inplace = True)
# print(snew)
snew.reset_index(inplace = True,drop = True)
snew

输出结果:

5.去重 .duplicated

# 去重 .duplicated

s = pd.Series([1,1,1,1,2,2,2,3,4,5,5,5,5])
print(s.duplicated())
print(s[s.duplicated() == False])
print('-----')
# 判断是否重复
# 通过布尔判断,得到不重复的值

s_re = s.drop_duplicates()
print(s_re)
print('-----')
# drop.duplicates移除重复
# inplace参数:是否替换原值,默认False

df = pd.DataFrame({'key1':['a','a',3,4,5],
                  'key2':['a','a','b','b','c']})
print(df,'\n')
print(df.duplicated())
print(df['key2'].duplicated())
# Dataframe中使用duplicated

输出结果:

0     False
1      True
2      True
3      True
4     False
5      True
6      True
7     False
8     False
9     False
10     True
11     True
12     True
dtype: bool
0    1
4    2
7    3
8    4
9    5
dtype: int64
-----
0    1
4    2
7    3
8    4
9    5
dtype: int64
-----
  key1 key2
0    a    a
1    a    a
2    3    b
3    4    b
4    5    c 

0    False
1     True
2    False
3    False
4    False
dtype: bool
0    False
1     True
2    False
3     True
4    False
Name: key2, dtype: bool

6. 替换 .replace

0    a
1    s
2    c
3    a
4    a
5    z
6    s
7    d
dtype: object 

0    NaN
1      s
2      c
3    NaN
4    NaN
5      z
6      s
7      d
dtype: object
0    NaN
1    NaN
2      c
3    NaN
4    NaN
5      z
6    NaN
7      d
dtype: object
0    hello world!
1             123
2               c
3    hello world!
4    hello world!
5               z
6             123
7               d
dtype: object

 练习题:

作业1:按要求创建Dataframe df1、df2,并合并成df3

import numpy as np
import pandas as pd

df1 = pd.DataFrame({'key':list('abc'),
                   'values1':np.random.rand(3)})
df2 = pd.DataFrame({'key':list('bcd'),
                   'values2':np.random.rand(3)})
print(df1,'\n')
print(df2,'\n')
print(pd.merge(df1, df2,on='key',how = 'outer'))

作业2:按要求创建Dataframe df1、df2,并合并成df3

#练习2
df3 = pd.DataFrame({'key1':['a','b','c'],
                   'values1':np.random.rand(3)})
df4 = pd.DataFrame({'key2':['b','c','d'],
                   'values2':np.random.rand(3)})
print(df3,'\n')
print(df4,'\n')

print(pd.merge(df3, df4,left_on='key1',right_on='key2',how = 'outer'))

 作业3:按要求创建Dataframe df1、df2,并合并成df3

#练习3
# df3 = pd.DataFrame({'key':['a','b','c'],
#                    'values1':np.random.rand(3)})
# df4 = pd.DataFrame({'values2':np.random.rand(3),
#                    'values3':[5,6,7]},index=['b','c','d'])
# print(df3,'\n')
# print(df4,'\n')
# print(pd.merge(df3, df4,left_on='key',right_index= True))

作业4:按要求创建Dataframe df1、df2,并连接成df3

#练习4
# df3 = pd.DataFrame({'values1':np.random.rand(4),
#                   'values2':np.random.rand(4)},index = ['a','b','c','d'])
# df4 = pd.DataFrame({'values1':np.random.rand(4),
#                     'values2':np.random.rand(4)},index=['e','f','g','h'])
# print(df3,'\n')
# print(df4,'\n')
# print(pd.concat([df3,df4]))

 

posted @ 2018-11-07 15:54  RamboBai  阅读(812)  评论(0编辑  收藏  举报