DataFrame的索引

#DataFrame既有行索引也有列索引,可以被看做由多个Series组成的字典(共用一个索引)

#索引方法有多种,记住这5种常用的方法即可
#只选择列 / 只选择行 / 选择行和列 /链式选择 / 布尔判断选择
#一,只选择列
# df[列名],选择列的方法只记这一种即可,其他的都是不常用的,记多了反而混淆
#只选择一列,df[列名]
#选择多列,用列表包含多个列名:df[[列名1,列名2...]]
#选择多列不可以切片:df[列名1:列名5]会报错,如果填入数字会选择行
import numpy as np
import pandas as pd
# Column-selection demo.
# Build a 3x4 frame of random values scaled by 100, with labelled rows
# ('one'..'three') and labelled columns ('a'..'d').
df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                   index = ['one','two','three'],
                   columns = ['a','b','c','d'])

print(df)
l = df['a']          # a single column name returns a Series
ls = df[['a','c']]   # a list of column names returns a DataFrame
h = df[0:2]  # an integer slice inside [] selects ROWS, not columns; avoid this form for row selection
# Print each selection that was just computed. The original printed the
# undefined names `data` and `ls1`, which raised NameError, and never showed `h`.
print(l)
print(ls)
print(h)
               a          b          c          d
one    44.386955  64.943123  84.604522  35.164263
two    75.446304  55.476815  25.105854  81.424303
three   6.303621  42.431963  68.578739  69.393774
one      44.386955
two      75.446304
three     6.303621
Name: a, dtype: float64
               a          c
one    44.386955  84.604522
two    75.446304  25.105854
three   6.303621  68.578739
             a          b          c          d
one  44.386955  64.943123  84.604522  35.164263
two  75.446304  55.476815  25.105854  81.424303
#二,只选择行loc[]和iloc[]
#只选择一行,loc[行标签],行标签可以是索引数字(没指定行索引名字时,且不能为-1)或名称索引(指定了行索引名字后)
#选择多行,用列表包含多个值,loc[[行标签1,行标签2...]]
#选择多行可以切片:df.loc[行标签1:行标签5],loc包含切片尾部
# Row-selection demo with loc[] and iloc[].
# df1 has named row labels; df2 keeps the default integer row index.
df1 = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
df2 = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    columns=list('abcd'),
)
print(df1, df2, sep='\n')

# Single row: the result is a Series whose name is the row label.
h = df1.loc['one']  # with named row labels, loc must be given the label name
h1 = df2.loc[0]     # without named labels, the default labels are integers starting at 0
# h2 = df.loc(-1)   # raises an error
print(h, h1, sep='\n')

# Several rows: pass a list of labels, or a label slice.
hs = df1.loc[['one', 'three']]
hs1 = df2.loc[[0, 3]]
hs2 = df1.loc['one':'three']
hs3 = df2.loc[0:3]  # loc INCLUDES the end of the slice

print(hs, hs1, hs2, hs3, sep='\n')

# iloc is true positional indexing. It is used like loc, but accepts only
# integer positions, allows -1, and EXCLUDES the end of the slice.
# Selecting one row or several rows works the same way and is not repeated here.
hs4 = df2.iloc[0:3]  # iloc excludes the end of the slice
print(hs4)



               a          b          c          d
one    51.204447  55.528528  58.210314  54.163497
two    41.858473  30.722846  17.749213  90.469865
three  99.200053   3.001227  72.551832  17.683482
four   27.134902  45.250912  28.113455  68.403044
           a          b          c          d
0  87.023917  60.621417  52.059756  77.975245
1  58.333329  14.945754  65.759015  34.399971
2  21.767209  71.009879  68.363179  70.344211
3  56.988215  88.706929  82.538999  34.399141
a    51.204447
b    55.528528
c    58.210314
d    54.163497
Name: one, dtype: float64
a    87.023917
b    60.621417
c    52.059756
d    77.975245
Name: 0, dtype: float64
               a          b          c          d
one    51.204447  55.528528  58.210314  54.163497
three  99.200053   3.001227  72.551832  17.683482
           a          b          c          d
0  87.023917  60.621417  52.059756  77.975245
3  56.988215  88.706929  82.538999  34.399141
               a          b          c          d
one    51.204447  55.528528  58.210314  54.163497
two    41.858473  30.722846  17.749213  90.469865
three  99.200053   3.001227  72.551832  17.683482
           a          b          c          d
0  87.023917  60.621417  52.059756  77.975245
1  58.333329  14.945754  65.759015  34.399971
2  21.767209  71.009879  68.363179  70.344211
3  56.988215  88.706929  82.538999  34.399141
           a          b          c          d
0  87.023917  60.621417  52.059756  77.975245
1  58.333329  14.945754  65.759015  34.399971
2  21.767209  71.009879  68.363179  70.344211
#三,选择行和列loc[选择行,选择列]
#逗号前面是选择行的操作,逗号后面选择列的操作
#具体用法就是把方法一和方法二结合起来,索引可单个,可间断,可切片
# Select rows and columns together: loc[row_selector, column_selector].
lh = df1.loc['one', 'a']                     # single label pair -> scalar value
lhs = df1.loc[['one', 'three'], ['a', 'c']]  # label lists -> non-contiguous selection
# The trailing 1 is the slice step, as with list slicing,
# except that loc includes the end label.
lhs1 = df1.loc['one':'three':1, 'a':'c':1]

print(lh, lhs, lhs1, sep='\n')
51.20444650565864
               a          c
one    51.204447  58.210314
three  99.200053  72.551832
               a          b          c
one    51.204447  55.528528  58.210314
two    41.858473  30.722846  17.749213
three  99.200053   3.001227  72.551832
#四,五:链式选择一般和布尔选择配合使用:当选择后的结果还是df对象时还可以继续选择
# Boolean mask built from a label-sliced sub-frame (rows one..three, columns a..c).
m_c = df1.loc['one':'three':1, 'a':'c':1] > 20
# First the True/False mask, then df1 filtered by it:
# the full table comes back with NaN wherever the condition does not hold.
print(m_c, df1[m_c], sep='\n')

# Chained selection: the masked result is still a DataFrame, so it can be indexed again.
res = df1[m_c].iloc[0:2]
# The same selection written as one expression, without the intermediate mask variable.
res1 = df1[df1.loc['one':'three':1, 'a':'c':1] > 20].iloc[0:2]
print(res, res1, sep='\n')
          a      b      c
one    True   True   True
two    True   True  False
three  True  False   True
               a          b          c   d
one    51.204447  55.528528  58.210314 NaN
two    41.858473  30.722846        NaN NaN
three  99.200053        NaN  72.551832 NaN
four         NaN        NaN        NaN NaN
             a          b          c   d
one  51.204447  55.528528  58.210314 NaN
two  41.858473  30.722846        NaN NaN
             a          b          c   d
one  51.204447  55.528528  58.210314 NaN
two  41.858473  30.722846        NaN NaN
posted @ 2020-10-28 00:41  Franciszw  阅读(1666)  评论(0)  编辑  收藏  举报