Data
Stay hungry,Stay foolish!

导航

 

层次化索引

层次化索引也就是在一个轴上拥有多个(两个及以上)索引级别

Series的层次化索引

data=Series(np.random.randn(10),index=[
    ['a','a','a','b','b','b','c','c','d','d'],
    [1,2,3,1,2,3,1,2,2,3]
])
data

a  1    0.965999
   2   -0.271733
   3    0.133910
b  1   -0.806885
   2   -0.622905
   3   -0.355330
c  1   -0.659194
   2   -1.082872
d  2   -0.043984
   3   -1.125324
dtype: float64

# 选取数据子集
data['b']
1   -0.806885
2   -0.622905
3   -0.355330
dtype: float64

data['b':'c'] # 按标签切片时首尾都包含(与Python按位置切片"含头不含尾"不同)

b  1   -0.806885
   2   -0.622905
   3   -0.355330
c  1   -0.659194
   2   -1.082872
dtype: float64

data.ix[['b','d']] # 按行索引名称选择(注:.ix 已在 pandas 1.0 中移除,新版本请改用 data.loc[['b','d']])
b  1   -0.806885
   2   -0.622905
   3   -0.355330
d  2   -0.043984
   3   -1.125324
dtype: float64

# 在内层索引中进行选取:取出每个外层索引下内层索引为2的数据
data[:,2]
a   -0.271733
b   -0.622905
c   -1.082872
d   -0.043984
dtype: float64

# 层次化索引在数据重塑和基于分组的操作中扮演着重要的角色
# unstack()会把层次化索引的Series转为DataFrame:最外层索引作为DataFrame的行索引,内层索引作为列索引,缺失的组合用NaN填充
data.unstack()

	1	          2	          3
a	0.965999	-0.271733	0.133910
b	-0.806885	-0.622905	-0.355330
c	-0.659194	-1.082872	NaN
d	NaN	        -0.043984	-1.125324

# unstack()的逆运算,转回来
data.unstack().stack()

a  1    0.965999
   2   -0.271733
   3    0.133910
b  1   -0.806885
   2   -0.622905
   3   -0.355330
c  1   -0.659194
   2   -1.082872
d  2   -0.043984
   3   -1.125324
dtype: float64

DataFrame的层次化索引

frame = pd.DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],
            columns=[['ohio','ohio','color'],['green','red','green']]
            )
frame

	ohio	color
   green	red	green
a	1	0	1	2
    2	3	4	5
b	1	6	7	8
    2	9	10	11

# 给层级行索引加名字
frame.index.names = ['key1','key2']
# 给层级列索引加名字
frame.columns.names = ['state','color']
frame

   state	ohio	color
   color	green	red	green
key1	key2			
a	1	0	1	2
    2	3	4	5
b	1	6	7	8
    2	9	10	11

frame['ohio']

   color	green	red
key1	key2		
a	1	0	1
    2	3	4
b	1	6	7
    2	9	10

重排分级顺序

frame

state	ohio	color
color	green	red	green
key1	key2			
a	1	0	1	2
    2	3	4	5
b	1	6	7	8
    2	9	10	11

# swaplevel(0,1)交换key1与key2两个层级;sortlevel(0)按交换后的第0层(即key2)排序(注:sortlevel已在pandas 1.0中移除,新版本请用sort_index(level=0))
frame.swaplevel(0,1).sortlevel(0)

	state	ohio	color
color	green	red	green
key2	key1			
1	a	0	1	2
    b	6	7	8
2	a	3	4	5
    b	9	10	11


# sortlevel(1)按交换后的第1层(即key1)排序(新版本对应sort_index(level=1))
frame.swaplevel(0,1).sortlevel(1)

state	ohio	color
color	green	red	green
key2	key1			
1	a	0	1	2
2	a	3	4	5
1	b	6	7	8
2	b	9	10	11

根据层次索引级别汇总统计

frame

state	ohio	color
color	green	red	green
key1	key2			
a	1	0	1	2
    2	3	4	5
b	1	6	7	8
    2	9	10	11

# 按key2层级分组求和:key2为1的行相加,key2为2的行相加(注:sum的level参数已在pandas 2.0中移除,新版本请用frame.groupby(level='key2').sum())
frame.sum(level='key2')

state	ohio	color
color	green	red	green
key2			
1	6	8	10
2	12	14	16

# 按列索引的color层级求和(axis=1):两个green列相加;red只有一列,保持不变(注:pandas 2.0起可写作frame.T.groupby(level='color').sum().T)
frame.sum(level='color',axis=1)

color	green	red
key1	key2		
a	1	2	1
    2	8	4
b	1	14	7
    2	20	10

使用DataFrame的列

frame1 = pd.DataFrame({'a':range(7),'b':range(7,0,-1),
                     'c':['one','one','one','two','two','two','two'],
                      'd':[0,1,2,0,1,2,3]
                     })

frame1

   a	b	c	d
0	0	7	one	0
1	1	6	one	1
2	2	5	one	2
3	3	4	two	0
4	4	3	two	1
5	5	2	two	2
6	6	1	two	3

# 把c、d两列设置为行索引,默认会从列中删除这两列;如果想保留,可以传入drop=False
frame1.set_index(['c','d'])

        a	b
c	d		
one	0	0	7
    1	1	6
    2	2	5
two	0	3	4
    1	4	3
    2	5	2
    3	6	1


# reset_index是set_index的逆操作:把行索引还原为普通列。注意:frame2在上文没有定义;从下面的输出看,这里是对默认整数索引的表调用,所以原索引变成了名为index的新列,了解就行
frame2.reset_index()

   index	a	b	c	d
0	0	0	7	one	0
1	1	1	6	one	1
2	2	2	5	one	2
3	3	3	4	two	0
4	4	4	3	two	1
5	5	5	2	two	2
6	6	6	1	two	3
posted on 2018-11-14 19:58  进击中的青年  阅读(855)  评论(0)  编辑  收藏  举报