数据规整化:清理、转换、合并、重塑

你也可以来看我的Github上的原文,欢迎交流:
https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E8%A7%84%E6%95%B4%E5%8C%96%EF%BC%9A%E6%B8%85%E7%90%86%E3%80%81%E8%BD%AC%E6%8D%A2%E3%80%81%E5%90%88%E5%B9%B6%E3%80%81%E9%87%8D%E5%A1%91.md
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series

1.合并数据集

  • pd.merge():各种参数的使用
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],\
                   'data1':[i for i in range(7)]})
df2 = DataFrame({'key':['a','b','d'],\
                    'data2':[i for i in range(3)]})
print(df1)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
print(df2)
   data2 key
0      0   a
1      1   b
2      2   d
print(pd.merge(df1,df2,on='key') )
#pd1 和 pd2 进行inner联结
#on :指明将列当做键。默认是重叠的。
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
df3 = DataFrame({'key1':['b','b','a','c','a','a','b'],\
                   'data1':[i for i in range(7)]})
df4 = DataFrame({'key2':['a','b','d'],\
                    'data2':[i for i in range(3)]})
# 如果没有重叠的列名
# 分别指定
print(pd.merge(df3,df4,left_on="key1",right_on='key2'))
   data1 key1  data2 key2
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
print(pd.merge(df1,df2,how="outer")) # 类似数据库的操作:inner、outer、left、right
   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0

1.1索引上的合并

left1 = DataFrame({'key':['a','b','a','a','b','c'],\
                   'data1':[i for i in range(6)]})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
print(right1)
print(left1)
   group_val
a        3.5
b        7.0
   data1 key
0      0   a
1      1   b
2      2   a
3      3   a
4      4   b
5      5   c
print(pd.merge(left1,right1,left_on='key',right_index=True))
   data1 key  group_val
0      0   a        3.5
2      2   a        3.5
3      3   a        3.5
1      1   b        7.0
4      4   b        7.0

1.2轴向连接

  • np.concatenate
  • pd.concat([s1,s2,s3])
arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1) #numpy对象的方法
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
print(pd.concat([s1,s2,s3],axis=1)) # 按照列
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
s4 = pd.concat([s1*5,s3])
print(pd.concat([s1,s4],axis=1))
print(pd.concat([s1,s4],axis=1,join="inner")) #取交集
print(pd.concat([s1,s1,s1],keys=['one','two','three']))# 创建层次化索引
print(pd.concat([s1,s1,s1],axis=1,keys=['one','two','three']))# 此时,层次化keys会变成列头
     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6
   0  1
a  0  0
b  1  5
one    a    0
       b    1
two    a    0
       b    1
three  a    0
       b    1
dtype: int64
   one  two  three
a    0    0      0
b    1    1      1
# 下面展示对于DataFrame的效果
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],\
               columns=['one','two'])
df2 = DataFrame(np.arange(4).reshape(2,2),index=['a','c'],\
               columns=['three','four'])
print(df1);print(df2)
   one  two
a    0    1
b    2    3
c    4    5
   three  four
a      0     1
c      2     3
print(pd.concat([df1,df2],axis=1,keys=['level1','level2']))
print(pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower']))
  level1     level2     
     one two  three four
a      0   1    0.0  1.0
b      2   3    NaN  NaN
c      4   5    2.0  3.0
upper level1     level2     
lower    one two  three four
a          0   1    0.0  1.0
b          2   3    NaN  NaN
c          4   5    2.0  3.0
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
print(df1)
print(df2)
          a         b         c         d
0 -0.447944  0.147116  0.342810 -0.618590
1  0.930223  0.379391 -0.994802  0.008333
2 -0.239952  0.150497 -0.603801 -1.153930
          b         d         a
0 -0.332100  0.245658  1.448707
1  1.137389 -0.170095 -1.552497
print(pd.concat([df1,df2]))

print("应该用这种发法")
print(pd.concat([df1,df2],ignore_index=True)) #不保留原来的行索引,产生新的行索引
          a         b         c         d
0 -0.447944  0.147116  0.342810 -0.618590
1  0.930223  0.379391 -0.994802  0.008333
2 -0.239952  0.150497 -0.603801 -1.153930
0  1.448707 -0.332100       NaN  0.245658
1 -1.552497  1.137389       NaN -0.170095
应该用这种发法
          a         b         c         d
0 -0.447944  0.147116  0.342810 -0.618590
1  0.930223  0.379391 -0.994802  0.008333
2 -0.239952  0.150497 -0.603801 -1.153930
3  1.448707 -0.332100       NaN  0.245658
4 -1.552497  1.137389       NaN -0.170095

1.4合并重叠数据

  • np.where
  • pandas:combine_first
# Array或者Series合并
a = Series([1,2,3,np.nan,4])
b= Series([1,3,3,4,np.nan])
print(np.where(pd.isnull(a),b,a))
[ 1.  2.  3.  4.  4.]
# Pandas提供了combine_first方法
df1 = DataFrame({'a':[1,2,np.nan],\
                'b':[2,3,5],\
                'c':[8,np.nan,0]})
df2 = DataFrame({'a':[5.,4],\
                'b':[np.nan,3.]})
print(df1.combine_first(df2))
     a  b    c
0  1.0  2  8.0
1  2.0  3  NaN
2  NaN  5  0.0

2.重塑和轴向旋转

2.1重塑层次化索引

  • stack:将列旋转为行
  • unstack:将行旋转为列
  • 默认情况下都是最内层
data = DataFrame(np.arange(6).reshape((2,3)),\
                index=pd.Index(['Ohio','Colorade'],name='state'),\
                columns=pd.Index(['onw','two','three'],name='number'))
print(data)
number    onw  two  three
state                    
Ohio        0    1      2
Colorade    3    4      5
result = data.stack()
print(result)
state     number
Ohio      onw       0
          two       1
          three     2
Colorade  onw       3
          two       4
          three     5
dtype: int32
print(result.unstack()) #我们再重排回去
number    onw  two  three
state                    
Ohio        0    1      2
Colorade    3    4      5
print(result.unstack(1))
print(result.unstack(0))
print(result.unstack('state'))
number    onw  two  three
state                    
Ohio        0    1      2
Colorade    3    4      5
state   Ohio  Colorade
number                
onw        0         3
two        1         4
three      2         5
state   Ohio  Colorade
number                
onw        0         3
two        1         4
three      2         5

2.2长格式=>宽格式

数据的格式如下:

处理成如下形式:

3.过滤、清洗和其他转换工作

Part:1-2 讲的是过滤工作

3.1移除重复数据

  • duplicated():返回是否重复列
  • drop_duplicates():去除重复列
data = DataFrame({'k1':['one']*3+['two']*4,\
                 'k2':[1,1,2,3,3,4,4]})
print(data)
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
print(data.duplicated())
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
print(data.drop_duplicates())
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
# 若,只希望根据k1列过滤重复项
data['v1'] = range(7)
print(data.drop_duplicates(['k1']))
    k1  k2  v1
0  one   1   0
3  two   3   3

3.2利用函数映射进行数据转换

就是调用map函数:可以一个函数或者有映射关系的字典

data = DataFrame({'food':['a','b','c']})
print(data)
  food
0    a
1    b
2    c
animal_to_food = {'a':'pig','b':'dog','c':'cat'}
data['animal'] = data['food'].map(str.lower).map(animal_to_food)
print(data)
  food animal
0    a    pig
1    b    dog
2    c    cat
del data['animal']
# 通过函数来完成
data['food'].map(lambda x: animal_to_food[x])
0    pig
1    dog
2    cat
Name: food, dtype: object

3.3替换值

fillnamap都可以,但更好的是:replace

data = Series([1,-999,-10000000,5])
print(data)
data = data.replace(-999,np.nan).copy()
print(data)
# 一次性替换多个值
# data.replace([-999,-1000],np.nan)
# 对应替换
# data.replace([-999,-1000],[np.nan,0])
# 对应替换:字典
# data.replace({-999:np.nan})
0           1
1        -999
2   -10000000
3           5
dtype: int64
0           1.0
1           NaN
2   -10000000.0
3           5.0
dtype: float64

3.4重命名轴/列索引

  • 方法一:data.index/columns:都有map方法
  • 方法二(推荐):data.rename(index = func,columns = func):func也可以是映射关系的字典
data = DataFrame(np.arange(12).reshape((3,4)),\
                index=['Ohio','Colorda','NesYord'],\
                columns = ['One','two','three','four'])
print(data)
         One  two  three  four
Ohio       0    1      2     3
Colorda    4    5      6     7
NesYord    8    9     10    11
data.index = data.index.map(str.upper)
print(data)
         One  two  three  four
OHIO       0    1      2     3
COLORDA    4    5      6     7
NESYORD    8    9     10    11
print(data.rename(index=str.title,columns=str.upper))
         ONE  TWO  THREE  FOUR
Ohio       0    1      2     3
Colorda    4    5      6     7
Nesyord    8    9     10    11
# 如果希望就地修改某个数据集,inplace=True
data.rename(index={'OHIO':'oooo'},columns=str.lower,inplace = True)
print(data)
         one  two  three  four
oooo       0    1      2     3
COLORDA    4    5      6     7
NESYORD    8    9     10    11

3.5分组:离散化和面元划分

pd.cut()函数 
pd.qcut():会获得数量相等的面元

ages = [20,22,25,27,21,23,37,31,61,45]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins) # bins可以是整数,自动计算。参数labels:每一组的名字。right可以修改左右开闭
print(cats)
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], (18, 25], (35, 60], (25, 35], (60, 100], (35, 60]]
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
print(cats.codes) #对应的第几个区间
[0 0 0 1 0 0 2 1 3 2]
print(pd.value_counts(cats)) # 查找
(18, 25]     5
(35, 60]     2
(25, 35]     2
(60, 100]    1
dtype: int64

3.6检测过滤异常值

  • 注意describe()的应用
np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
print(data.describe())
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
col = data[3]
print(col[np.abs(col)>3])
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
print(data[(np.abs(data)>3).any(1)]) # 出现绝对值大于3的行
            0         1         2         3
5   -0.539741  0.476985  3.248944 -1.021228
97  -0.774363  0.552936  0.106061  3.927528
102 -0.655054 -0.565230  3.176873  0.959533
305 -2.315555  0.457246 -0.025907 -3.399312
324  0.050188  1.951312  3.260383  0.963301
400  0.146326  0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990  1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586  0.275144  1.179227 -3.184377  1.369891
808 -0.362528 -3.548824  1.553205 -2.186301
900  3.366626 -2.372214  0.851010  1.332846

3.7排列和随机采样

  • np.random.permutation():返回随机重排列的index
  • DataFrame.take(上面的index):重新排列
df = DataFrame(np.arange(5*4).reshape(5,4))
sampler = np.random.permutation(len(df))
print(sampler)
[1 0 4 3 2]
print(df)
print(df.take(sampler))
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
    0   1   2   3
1   4   5   6   7
0   0   1   2   3
4  16  17  18  19
3  12  13  14  15
2   8   9  10  11
posted @ 2017-08-22 18:59  AsuraDong  阅读(829)  评论(0编辑  收藏  举报