Groupby的用法
import pandas as pd
# Sample data for the groupby/agg/apply examples, written row by row:
# one (Country, Income, Age) tuple per record, in index order 0..7.
df = pd.DataFrame(
    [
        ('China', 10000, 5000),
        ('China', 10000, 4321),
        ('India', 5000, 1234),
        ('India', 5002, 4010),
        ('America', 40000, 250),
        ('Japan', 50000, 250),
        ('China', 8000, 4500),
        ('India', 5000, 4321),
    ],
    columns=['Country', 'Income', 'Age'],
)
# Bare expression: in a notebook cell this renders the frame as a table.
df
|   | Country | Income | Age  |
|---|---------|--------|------|
| 0 | China   | 10000  | 5000 |
| 1 | China   | 10000  | 4321 |
| 2 | India   | 5000   | 1234 |
| 3 | India   | 5002   | 4010 |
| 4 | America | 40000  | 250  |
| 5 | Japan   | 50000  | 250  |
| 6 | China   | 8000   | 4500 |
| 7 | India   | 5000   | 4321 |
按单列进行分组
# Group the rows by the 'Country' column. Iterating a GroupBy object
# yields one (group key, sub-DataFrame) pair per distinct key; with the
# default sort=True the keys come out in sorted order.
df_gb = df.groupby('Country')
for index, data in df_gb:
    # `index` is the country name, `data` holds that country's rows
    # (original row labels are preserved).
    print(index)
    print(data)
America
Country Income Age
4 America 40000 250
China
Country Income Age
0 China 10000 5000
1 China 10000 4321
6 China 8000 4500
India
Country Income Age
2 India 5000 1234
3 India 5002 4010
7 India 5000 4321
Japan
Country Income Age
5 Japan 50000 250
按多列进行分组
# Group by the combination of 'Country' and 'Income'. With a list of
# keys, each group key is a tuple with one entry per grouping column,
# so it can be unpacked directly in the loop header.
df_gb = df.groupby(['Country', 'Income'])
for (index1, index2), data in df_gb:
    # index1 is the country, index2 the income shared by the rows in `data`.
    print((index1, index2))
    print(data)
('America', 40000)
Country Income Age
4 America 40000 250
('China', 8000)
Country Income Age
6 China 8000 4500
('China', 10000)
Country Income Age
0 China 10000 5000
1 China 10000 4321
('India', 5000)
Country Income Age
2 India 5000 1234
7 India 5000 4321
('India', 5002)
Country Income Age
3 India 5002 4010
('Japan', 50000)
Country Income Age
5 Japan 50000 250
agg聚合操作
# Compute min, mean and max of every non-key column for each country.
# Passing a list of statistics produces two-level columns:
# (original column, statistic).
by_country = df.groupby('Country')
df_agg = by_country.agg(['min', 'mean', 'max'])
print(df_agg)
Income Age
min mean max min mean max
Country
America 40000 40000.000000 40000 250 250.000000 250
China 8000 9333.333333 10000 4321 4607.000000 5000
India 5000 5000.666667 5002 1234 3188.333333 4321
Japan 50000 50000.000000 50000 250 250.000000 250
# A dict passed to agg() maps a column name to the statistics to compute
# for it, so only the listed column ('Age') appears in the result.
num_agg = {'Age': ['min', 'mean', 'max']}
age_summary = df.groupby('Country').agg(num_agg)
print(age_summary)
Age
min mean max
Country
America 250 250.000000 250
China 4321 4607.000000 5000
India 1234 3188.333333 4321
Japan 250 250.000000 250
# Different statistics per column: three for 'Age', two for 'Income'.
# Result columns follow the order of the dict keys.
num_agg = {
    'Age': ['min', 'mean', 'max'],
    'Income': ['min', 'max'],
}
per_country = df.groupby('Country').agg(num_agg)
print(per_country)
Age Income
min mean max min max
Country
America 250 250.000000 250 40000 40000
China 4321 4607.000000 5000 8000 10000
India 1234 3188.333333 4321 5000 5002
Japan 250 250.000000 250 50000 50000
Apply
# Bare expression: shows the unchanged frame again (notebook cell display)
# before the element-wise apply example below.
df
|   | Country | Income | Age  |
|---|---------|--------|------|
| 0 | China   | 10000  | 5000 |
| 1 | China   | 10000  | 4321 |
| 2 | India   | 5000   | 1234 |
| 3 | India   | 5002   | 4010 |
| 4 | America | 40000  | 250  |
| 5 | Japan   | 50000  | 250  |
| 6 | China   | 8000   | 4500 |
| 7 | India   | 5000   | 4321 |
def _first_char(value):
    """Return the first character of the string form of *value*."""
    return str(value)[0]


# Series.apply calls the function once per element of the 'Age' column;
# the result is a new Series of one-character strings (df is not modified).
df['Age'].apply(_first_char)
0 5
1 4
2 1
3 4
4 2
5 2
6 4
7 4
Name: Age, dtype: object