轴向连接(concat)
Numpy
import numpy as np
import pandas as pd
from pandas import Series
# Build a 3x4 integer matrix holding 0..11 in row-major order.
arr = np.arange(3 * 4).reshape((3, 4))
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
# axis defaults to 0, so the second copy is appended below the first (rows);
# pass axis=1 to join along columns instead.
np.concatenate([arr, arr])
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
pandas对象的轴向连接
# Three Series whose indexes do not overlap, stacked end to end.
s1 = Series([0, 1], index=list("ab"))
s2 = Series([2, 3, 4], index=list("cde"))
s3 = Series([5, 6], index=list("fg"))
# pd.concat takes an iterable (here a list) of the objects to join.
pd.concat([s1, s2, s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
# With axis=1 each Series becomes a column, so the result is a DataFrame;
# labels missing from a given Series are filled with NaN.
pd.concat([s1, s2, s3], axis=1)
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
# s4 shares labels 'a' and 'b' with s1 (values scaled by 5) and adds s3's labels.
s4 = pd.concat([s1.mul(5), s3])
s4
a 0
b 5
f 5
g 6
dtype: int64
# The default join is 'outer': the result index is the union of both indexes.
pd.concat([s1, s4], axis=1)
0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6
# The default join is 'outer'; pass join='inner' to keep only the labels
# present in every input (the intersection).
pd.concat([s1, s4], axis=1, join="inner")
0 1
a 0 0
b 1 5
# Select the exact row labels wanted in the result; labels that appear in
# neither input come back as NaN rows.
# The join_axes keyword is no longer accepted by pd.concat, so the result
# is reindexed instead.
pd.concat([s1, s4], axis=1).reindex(['a', 'b', 'c', 'e'])
0 1
a 0.0 0.0
b 1.0 5.0
c NaN NaN
e NaN NaN
# keys builds a hierarchical index on the concatenation axis, one key per
# input object. Two objects are concatenated, so exactly two keys are given
# (the original passed an unused third key).
pd.concat([s1, s4], keys=['one', 'two'])
one a 0
b 1
two a 0
b 5
f 5
g 6
dtype: int64
# Baseline without keys: the columns get the default integer labels 0, 1, 2.
pd.concat(
    [s1, s2, s3],
    axis=1,
)
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
# When Series are concatenated along axis=1, the keys become the
# DataFrame's column names.
pd.concat(
    [s1, s2, s3],
    axis=1,
    keys=['one', 'two', 'three'],
)
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
DataFrame的concat操作
# 3x2 frame holding 0..5, rows labelled a/b/c.
df1 = pd.DataFrame(
    np.arange(6).reshape((3, 2)),
    index=list("abc"),
    columns=['one', 'two'],
)
df1
one two
a 0 1
b 2 3
c 4 5
# 2x2 frame holding 5..8; it has no row 'b', so it only partly overlaps df1.
df2 = pd.DataFrame(
    np.arange(4).reshape((2, 2)) + 5,
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2
three four
a 5 6
c 7 8
# Join the frames side by side (column-wise); rows missing from one frame
# are filled with NaN.
pd.concat([df1, df2], axis=1)
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
# Passing a dict instead of a list uses the dict keys as the outer level
# of the hierarchical column index.
pd.concat(dict(level1=df1, level2=df2), axis=1)
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
# names labels the levels of the hierarchical column index: 'upper' names
# the level holding level1/level2, 'lower' the level holding the original
# column names.
pd.concat(
    [df1, df2],
    axis=1,
    keys=['level1', 'level2'],
    names=['upper', 'lower'],
)
upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
# Two frames of random normal data; df4 has no column 'c' and lists its
# columns in a different order.
df3 = pd.DataFrame(
    np.random.randn(3, 4),
    columns=list("abcd"),
)
df4 = pd.DataFrame(
    np.random.randn(2, 3),
    columns=list("bda"),
)
# Each frame keeps its own row labels, so labels repeat in the result;
# pass ignore_index=True to discard them.
pd.concat([df3, df4])
a b c d
0 0.649869 -0.332470 0.918562 -1.781167
1 -0.271012 0.702998 -2.164433 0.185556
2 0.279104 -0.846209 -0.366614 0.444451
0 -0.204010 -0.974424 NaN -2.215621
1 0.504930 0.490877 NaN 0.332790
# With ignore_index=True the original row labels are dropped and the
# result is renumbered consecutively from 0.
pd.concat(
    [df3, df4],
    ignore_index=True,
)
a b c d
0 0.649869 -0.332470 0.918562 -1.781167
1 -0.271012 0.702998 -2.164433 0.185556
2 0.279104 -0.846209 -0.366614 0.444451
3 -0.204010 -0.974424 NaN -2.215621
4 0.504930 0.490877 NaN 0.332790
合并重叠数据(combine_first)
# Series with gaps (NaN) at labels f, d and a.
a = Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list("fedcba"),
)
a
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
# Fully populated float Series with the same labels as a, holding 0.0..5.0.
b = Series(
    np.arange(len(a), dtype=float),
    index=list("fedcba"),
)
b
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a 5.0
dtype: float64
# np.where(condition, value_if_true, value_if_false): wherever a is null
# take b's value, otherwise keep a's value.
np.where(a.isnull(), b, a)
array([0. , 2.5, 2. , 3.5, 4.5, 5. ])
# Everything from the third element of a onwards (positional slice).
a.iloc[2:]
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
# Everything in b except its last two elements (positional slice).
b.iloc[:-2]
f 0.0
e 1.0
d 2.0
c 3.0
dtype: float64
# Patch b's slice with values from a's slice: where both have a value for
# the same label, b's value wins; a only fills what b lacks.
b.iloc[:-2].combine_first(a.iloc[2:])
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
# Patch a's slice with values from b's slice: where both have a value for
# the same label, a's value wins; b only fills a's NaN or missing labels.
a.iloc[2:].combine_first(b.iloc[:-2])
a NaN
b 4.5
c 3.5
d 2.0
e 1.0
f 0.0
dtype: float64