Data Wrangling: Cleaning, Transforming, Merging, and Reshaping
You can also read the original post on my GitHub; feedback and discussion are welcome:
https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E8%A7%84%E6%95%B4%E5%8C%96%EF%BC%9A%E6%B8%85%E7%90%86%E3%80%81%E8%BD%AC%E6%8D%A2%E3%80%81%E5%90%88%E5%B9%B6%E3%80%81%E9%87%8D%E5%A1%91.md
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
1. Merging Datasets
- pd.merge(): usage of its various parameters
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],\
'data1':[i for i in range(7)]})
df2 = DataFrame({'key':['a','b','d'],\
'data2':[i for i in range(3)]})
print(df1)
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
print(df2)
data2 key
0 0 a
1 1 b
2 2 d
print(pd.merge(df1,df2,on='key') )
# inner join of df1 and df2
# on: the column(s) to use as the join key; by default the overlapping column names are used
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],\
'data1':[i for i in range(7)]})
df4 = DataFrame({'key2':['a','b','d'],\
'data2':[i for i in range(3)]})
# when there are no overlapping column names,
# specify the key column for each side separately
print(pd.merge(df3,df4,left_on="key1",right_on='key2'))
data1 key1 data2 key2
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
print(pd.merge(df1,df2,how="outer")) # like database joins: inner, outer, left, right
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
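The other how options behave the same way; as a quick illustration (reusing df1 and df2 from above), how="left" keeps every row of df1 and fills keys that have no match in df2 with NaN:
print(pd.merge(df1,df2,how="left",on="key")) # the row with key 'c' gets NaN in data2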
1.1 Merging on Index
left1 = DataFrame({'key':['a','b','a','a','b','c'],\
'data1':[i for i in range(6)]})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
print(right1)
print(left1)
group_val
a 3.5
b 7.0
data1 key
0 0 a
1 1 b
2 2 a
3 3 a
4 4 b
5 5 c
print(pd.merge(left1,right1,left_on='key',right_index=True))
data1 key group_val
0 0 a 3.5
2 2 a 3.5
3 3 a 3.5
1 1 b 7.0
4 4 b 7.0
1.2 Concatenating Along an Axis
- np.concatenate
- pd.concat([s1,s2,s3])
arr = np.arange(12).reshape((3,4))
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
np.concatenate([arr,arr],axis=1) # NumPy's concatenation function for ndarrays
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
print(pd.concat([s1,s2,s3],axis=1)) # concatenate along the columns
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
s4 = pd.concat([s1*5,s3])
print(pd.concat([s1,s4],axis=1))
print(pd.concat([s1,s4],axis=1,join="inner")) # keep only the intersection of the indexes
print(pd.concat([s1,s1,s1],keys=['one','two','three'])) # create a hierarchical index
print(pd.concat([s1,s1,s1],axis=1,keys=['one','two','three'])) # with axis=1, the keys become the column headers
0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6
0 1
a 0 0
b 1 5
one a 0
b 1
two a 0
b 1
three a 0
b 1
dtype: int64
one two three
a 0 0 0
b 1 1 1
# the same operations applied to DataFrames
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],\
columns=['one','two'])
df2 = DataFrame(np.arange(4).reshape(2,2),index=['a','c'],\
columns=['three','four'])
print(df1);print(df2)
one two
a 0 1
b 2 3
c 4 5
three four
a 0 1
c 2 3
print(pd.concat([df1,df2],axis=1,keys=['level1','level2']))
print(pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower']))
level1 level2
one two three four
a 0 1 0.0 1.0
b 2 3 NaN NaN
c 4 5 2.0 3.0
upper level1 level2
lower one two three four
a 0 1 0.0 1.0
b 2 3 NaN NaN
c 4 5 2.0 3.0
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
print(df1)
print(df2)
a b c d
0 -0.447944 0.147116 0.342810 -0.618590
1 0.930223 0.379391 -0.994802 0.008333
2 -0.239952 0.150497 -0.603801 -1.153930
b d a
0 -0.332100 0.245658 1.448707
1 1.137389 -0.170095 -1.552497
print(pd.concat([df1,df2]))
print("应该用这种发法")
print(pd.concat([df1,df2],ignore_index=True)) #不保留原来的行索引,产生新的行索引
a b c d
0 -0.447944 0.147116 0.342810 -0.618590
1 0.930223 0.379391 -0.994802 0.008333
2 -0.239952 0.150497 -0.603801 -1.153930
0 1.448707 -0.332100 NaN 0.245658
1 -1.552497 1.137389 NaN -0.170095
It is better to use this approach
a b c d
0 -0.447944 0.147116 0.342810 -0.618590
1 0.930223 0.379391 -0.994802 0.008333
2 -0.239952 0.150497 -0.603801 -1.153930
3 1.448707 -0.332100 NaN 0.245658
4 -1.552497 1.137389 NaN -0.170095
1.4 Combining Data with Overlap
- np.where
- pandas: combine_first()
# combining arrays or Series
a = Series([1,2,3,np.nan,4])
b= Series([1,3,3,4,np.nan])
print(np.where(pd.isnull(a),b,a))
[ 1. 2. 3. 4. 4.]
# pandas provides the combine_first method for this
df1 = DataFrame({'a':[1,2,np.nan],\
'b':[2,3,5],\
'c':[8,np.nan,0]})
df2 = DataFrame({'a':[5.,4],\
'b':[np.nan,3.]})
print(df1.combine_first(df2))
a b c
0 1.0 2 8.0
1 2.0 3 NaN
2 NaN 5 0.0
2. Reshaping and Pivoting
2.1 Reshaping with a Hierarchical Index
- stack: pivots the columns into rows
- unstack: pivots the rows into columns
- by default, both operate on the innermost level
data = DataFrame(np.arange(6).reshape((2,3)),\
index=pd.Index(['Ohio','Colorado'],name='state'),\
columns=pd.Index(['one','two','three'],name='number'))
print(data)
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
result = data.stack()
print(result)
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
print(result.unstack()) # pivot back to the original layout
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
print(result.unstack(1))
print(result.unstack(0))
print(result.unstack('state'))
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
2.2 Long Format => Wide Format
Data is often stored in "long" format: one row per (key, item) observation, with the item names held in a column.
DataFrame.pivot reshapes it into "wide" format, where each distinct item value becomes its own column.
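A minimal sketch of the conversion (the date/item/value column names and the numbers are my own illustration, not the original example's data):
ldata = DataFrame({'date':['2000-03-31','2000-03-31','2000-06-30','2000-06-30'],\
'item':['realgdp','infl','realgdp','infl'],\
'value':[2710.3,0.0,2712.4,2.3]})
pivoted = ldata.pivot(index='date',columns='item',values='value') # long => wide: one column per item
print(pivoted)
item infl realgdp
date
2000-03-31 0.0 2710.3
2000-06-30 2.3 2712.4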
3. Filtering, Cleaning, and Other Transformations
Subsections 1 and 2 below deal with the filtering work.
3.1 Removing Duplicates
- duplicated(): returns a boolean Series marking whether each row duplicates an earlier one
- drop_duplicates(): drops those duplicate rows
data = DataFrame({'k1':['one']*3+['two']*4,\
'k2':[1,1,2,3,3,4,4]})
print(data)
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
print(data.duplicated())
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
print(data.drop_duplicates())
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
# if you only want to drop duplicates based on the k1 column
data['v1'] = range(7)
print(data.drop_duplicates(['k1']))
k1 k2 v1
0 one 1 0
3 two 3 3
3.2 Transforming Data with a Function or Mapping
Use the map method: it accepts either a function or a dict that defines the mapping.
data = DataFrame({'food':['a','b','c']})
print(data)
food
0 a
1 b
2 c
animal_to_food = {'a':'pig','b':'dog','c':'cat'}
data['animal'] = data['food'].map(str.lower).map(animal_to_food)
print(data)
food animal
0 a pig
1 b dog
2 c cat
del data['animal']
# or do the same thing with a function
data['food'].map(lambda x: animal_to_food[x])
0 pig
1 dog
2 cat
Name: food, dtype: object
3.3 Replacing Values
fillna and map can both do this, but replace is the better choice.
data = Series([1,-999,-10000000,5])
print(data)
data = data.replace(-999,np.nan).copy()
print(data)
# replace several values at once
# data.replace([-999,-1000],np.nan)
# pairwise replacement with two lists
# data.replace([-999,-1000],[np.nan,0])
# pairwise replacement via a dict
# data.replace({-999:np.nan})
0 1
1 -999
2 -10000000
3 5
dtype: int64
0 1.0
1 NaN
2 -10000000.0
3 5.0
dtype: float64
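For comparison, a rough sketch of the map-based equivalent (my own illustration): even a single substitution needs an explicit function, which is why replace reads better.
data2 = Series([1,-999,-10000000,5])
print(data2.map(lambda x: np.nan if x == -999 else x)) # same effect as replace(-999, np.nan)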
3.4 Renaming Axis Indexes
- Method 1: data.index / data.columns both have a map method
- Method 2 (recommended): data.rename(index=func, columns=func); func can also be a dict that defines the mapping
data = DataFrame(np.arange(12).reshape((3,4)),\
index=['Ohio','Colorado','New York'],\
columns = ['One','two','three','four'])
print(data)
One two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
data.index = data.index.map(str.upper)
print(data)
One two three four
OHIO 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
print(data.rename(index=str.title,columns=str.upper))
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
# to modify the dataset in place, pass inplace=True
data.rename(index={'OHIO':'oooo'},columns=str.lower,inplace = True)
print(data)
one two three four
oooo 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
3.5 Discretization and Binning
pd.cut() bins values by explicit bin edges; pd.qcut() bins by sample quantiles, so each bin gets roughly the same number of observations.
ages = [20,22,25,27,21,23,37,31,61,45]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins) # bins may also be an integer (edges computed automatically); labels names each bin; right controls whether the right edge is closed
print(cats)
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], (18, 25], (35, 60], (25, 35], (60, 100], (35, 60]]
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
print(cats.codes) # which bin each value falls into
[0 0 0 1 0 0 2 1 3 2]
print(pd.value_counts(cats)) # counts per bin
(18, 25] 5
(35, 60] 2
(25, 35] 2
(60, 100] 1
dtype: int64
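A minimal pd.qcut() sketch using the same ages list: cutting into quartiles puts roughly the same number of values in each bin, with the edges taken from the sample quantiles.
quarts = pd.qcut(ages,4) # 4 quantile-based bins
print(pd.value_counts(quarts)) # each bin holds about len(ages)/4 of the values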
3.6 Detecting and Filtering Outliers
- note the use of describe() to summarize the distribution
np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
print(data.describe())
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
col = data[3]
print(col[np.abs(col)>3])
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64
print(data[(np.abs(data)>3).any(axis=1)]) # rows containing at least one value whose absolute value exceeds 3
0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846
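To actually filter these outliers rather than just locate them, one common follow-up (a sketch, not part of the original) caps values outside [-3, 3] at the boundary using np.sign:
capped = data.copy()
capped[np.abs(capped) > 3] = np.sign(capped) * 3 # clip every outlier to +3 or -3
print(capped.describe()) # min and max are now bounded by -3 and 3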
3.7 Permutation and Random Sampling
- np.random.permutation(): returns a randomly permuted array of row positions
- DataFrame.take(that array): reorders the rows accordingly
df = DataFrame(np.arange(5*4).reshape(5,4))
sampler = np.random.permutation(len(df))
print(sampler)
[1 0 4 3 2]
print(df)
print(df.take(sampler))
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
0 1 2 3
1 4 5 6 7
0 0 1 2 3
4 16 17 18 19
3 12 13 14 15
2 8 9 10 11
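Permutation reorders every row; to draw a random sample instead (a sketch, not part of the original), take only the first few positions of the permutation, or draw positions with replacement via np.random.randint:
print(df.take(np.random.permutation(len(df))[:3])) # 3 rows sampled without replacement
print(df.take(np.random.randint(0,len(df),size=5))) # 5 rows sampled with replacement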