pandas之分组

import numpy as np
import pandas as pd

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
print(df)

# data1列数据,按照key1来进行分组
g1 = df['data1'].groupby(by=df['key1'])
print(g1)
print(g1.mean())

# 一次传入多个数组,按照key1,key2分组计算
g2 = df['data1'].groupby(by=[df['key1'], df['key2']])
print(g2.sum())

# 可以将列名(可以是字符串、数字或者其他python对象)用作分组
g3 = df.groupby(by='key1')
print(g3.mean())

g4 = df.groupby(by=['key1', 'key2'])
print(g4.mean())

输出结果:
  key1 key2     data1     data2
0    a  one  0.374982  1.064534
1    a  two -0.485860  0.071513
2    b  one -0.741052  1.868335
3    b  two -0.757161 -0.536510
4    a  one -0.092603 -1.647309
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001720F810BA8>
key1
a   -0.067827
b   -0.749106
Name: data1, dtype: float64
key1  key2
a     one     0.282379
      two    -0.485860
b     one    -0.741052
      two    -0.757161
Name: data1, dtype: float64
         data1     data2
key1                    
a    -0.067827 -0.170421
b    -0.749106  0.665912
              data1     data2
key1 key2                    
a    one   0.141189 -0.291387
     two  -0.485860  0.071513
b    one  -0.741052  1.868335
     two  -0.757161 -0.536510
import numpy as np
import pandas as pd

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
print(df)

states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
# 按照states, years传入数组的方式对data1列数据分组后进行平均值计算
g1 = df['data1'].groupby(by=[states, years])
print(g1.mean())

people = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
print(people)
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red'}
g2 = people.groupby(mapping, axis=1).sum()
print(g2)


输出结果:
  key1 key2     data1     data2
0    a  one -0.388889  0.083999
1    a  two  0.109513  2.016369
2    b  one -0.423841  0.800895
3    b  two -0.899408  0.039836
4    a  one -0.137741 -1.382602
California  2005    0.109513
            2006   -0.423841
Ohio        2005   -0.644148
            2006   -0.137741
Name: data1, dtype: float64
               a         b         c         d         e
Joe     1.179284  0.289269  0.303566  1.493424 -1.432677
Steve  -0.386514 -0.154331 -0.771970 -0.597251  0.004637
Wes    -0.933249  0.300309 -1.659948  0.404844 -0.375680
Jim    -2.427930  0.574752  0.504091 -0.996975 -1.547615
Travis -1.555477 -1.065392 -1.149300  1.326839 -1.530914
            blue       red
Joe     1.796990  0.035876
Steve  -1.369220 -0.536208
Wes    -1.255104 -1.008621
Jim    -0.492884 -3.400793
Travis  0.177539 -4.151784

 

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'A': [1, 1, 2, 2],
    'B': [1, 2, 3, 4],
    'C': np.random.randn(4)})

print(df)
# 按照A列分组后聚合求最小值
df1 = df.groupby('A').agg('min')
print(df1)

# 按照A列分组后聚合求最小值和最大值
df2 = df.groupby('A').agg(['min', 'max'])
print(df2)

# 按照A列分组后聚合后对B列求最小值和最大值,同时对C列求和
df3 = df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
print(df3)

输出结果:
   A  B         C
0  1  1  1.630997
1  1  2 -0.238686
2  2  3  0.841689
3  2  4 -0.532344
   B         C
A             
1  1 -0.238686
2  3 -0.532344
    B             C          
  min max       min       max
A                            
1   1   2 -0.238686  1.630997
2   3   4 -0.532344  0.841689
    B             C
  min max       sum
A                  
1   1   2  1.392311
2   3   4  0.309345

 

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(4, 5), columns=list('ABCDE'))
print(df)

# 计算各列数据总和并作为新列添加到末尾
df['col_sum'] = df.apply(lambda x: x.sum(), axis=1)
print(df)
# 计算各行数据总和并作为新行添加到末尾
df.loc['row_sum'] = df.apply(lambda x: x.sum(), axis=0)
print(df)



输出结果:
          A         B         C         D         E
0 -0.978040 -0.820454  0.673568  1.613058 -0.203536
1  0.879338  0.561962  0.278855 -0.481307  1.097682
2 -1.244722  1.304386  0.206882 -0.715907 -1.246526
3  1.419993  0.453622  0.237463  1.292601 -0.184714
          A         B         C         D         E   col_sum
0 -0.978040 -0.820454  0.673568  1.613058 -0.203536  0.284595
1  0.879338  0.561962  0.278855 -0.481307  1.097682  2.336530
2 -1.244722  1.304386  0.206882 -0.715907 -1.246526 -1.695887
3  1.419993  0.453622  0.237463  1.292601 -0.184714  3.218965
                A         B         C         D         E   col_sum
0       -0.978040 -0.820454  0.673568  1.613058 -0.203536  0.284595
1        0.879338  0.561962  0.278855 -0.481307  1.097682  2.336530
2       -1.244722  1.304386  0.206882 -0.715907 -1.246526 -1.695887
3        1.419993  0.453622  0.237463  1.292601 -0.184714  3.218965
row_sum  0.076569  1.499516  1.396768  1.708445 -0.537094  4.144204

 

posted @ 2019-07-29 21:00  一如年少模样  阅读(801)  评论(0编辑  收藏  举报