python进行数据分析groupby基础操作
from pandas import Series,DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
Out[10]:
key1 key2
a one -0.230076
two 1.248653
b one -0.196613
two 0.689761
Name: data1, dtype: float64
means.unstack(level=-1)
Out[12]:
key2 one two
key1
a -0.230076 1.248653
b -0.196613 0.689761
means.unstack(level=0)
Out[13]:
key1 a b
key2
one -0.230076 -0.196613
two 1.248653 0.689761
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
people = DataFrame(np.random.randn(5, 5),columns=['a', 'b', 'c', 'd', 'e'],index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
mapping
Out[25]: {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
people
Out[26]:
a b c d e
Joe -0.676828 0.880429 0.386540 2.961741 1.651430
Steve 0.269119 0.916097 -0.238961 0.949983 -0.697962
Wes 0.078493 -0.120419 -1.275525 -0.289106 -0.956001
Jim -1.225288 0.120219 -1.173347 -0.792199 -0.417982
Travis 0.728367 0.906879 0.373102 -1.151590 -1.883083
people.groupby(mapping,axis=1).sum()
Out[29]:
blue red
Joe 3.348280 1.855032
Steve 0.711022 0.487254
Wes -1.564631 -0.997927
Jim -1.965546 -1.523050
Travis -0.778488 -0.247836
people.groupby(len).sum()
Out[30]:
a b c d e
3 -1.823623 0.880229 -2.062331 1.880435 0.277447
5 0.269119 0.916097 -0.238961 0.949983 -0.697962
6 0.728367 0.906879 0.373102 -1.151590 -1.883083
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
columns
Out[35]:
MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df.groupby(level='cty',axis=1).count()
Out[37]:
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df
Out[16]:
data1 data2 key1 key2
0 -0.169761 -0.297803 a one
1 1.248653 0.116745 a two
2 -0.196613 -0.224198 b one
3 0.689761 1.436277 b two
4 -0.290392 1.292000 a one
df['data1'].groupby([states,years]).mean()
Out[17]:
California 2005 1.248653
2006 -0.196613
Ohio 2005 0.260000
2006 -0.290392
Name: data1, dtype: float64
#被聚合的只有数值列
df.groupby(df['key1']).mean()
Out[19]:
data1 data2
key1
a 0.262833 0.370314
b 0.246574 0.606039
df.groupby(['key1','key2']).mean()
Out[20]:
data1 data2
key1 key2
a one -0.230076 0.497098
two 1.248653 0.116745
b one -0.196613 -0.224198
two 0.689761 1.436277
for name,group in df.groupby(['key1']):
print (name)
print(group)
a
data1 data2 key1 key2
0 -0.169761 -0.297803 a one
1 1.248653 0.116745 a two
4 -0.290392 1.292000 a one
b
data1 data2 key1 key2
2 -0.196613 -0.224198 b one
3 0.689761 1.436277 b two
for (k1,k2),group in df.groupby(['key1','key2']):
print (k1,k2)
print(group)
a one
data1 data2 key1 key2
0 -0.169761 -0.297803 a one
4 -0.290392 1.292000 a one
a two
data1 data2 key1 key2
1 1.248653 0.116745 a two
b one
data1 data2 key1 key2
2 -0.196613 -0.224198 b one
b two
data1 data2 key1 key2
3 0.689761 1.436277 b two
df.dtypes
Out[29]:
data1 float64
data2 float64
key1 object
key2 object
dtype: object
grouped=df.groupby(df.dtypes,axis=1)
dict(list(grouped))
Out[31]:
{dtype('float64'): data1 data2
0 -0.169761 -0.297803
1 1.248653 0.116745
2 -0.196613 -0.224198
3 0.689761 1.436277
4 -0.290392 1.292000, dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
df.groupby(['key1', 'key2'])[['data2']].mean()
Out[15]:
data2
key1 key2
a one 0.942926
two -0.671692
b one 0.366026
two -0.794155
df.ix[2:3,['data1','data2']]=np.nan
df
Out[22]:
data1 data2 key1 key2
0 -1.090066 0.132120 a one
1 -0.683913 -0.671692 a two
2 NaN NaN b one
3 NaN NaN b two
4 -0.003452 1.753732 a one