pandas-分组过滤聚合
pandas-分组过滤聚合
groupby分组
通过groupby方法对series或者dataframe对象进行分组,该方法会返回一个分组对象
pandas中的groupby函数也是先将df按照某个字段进行拆分,将相同属性分为一组;然后对拆分后的各组执行相应的转换操作;最后输出汇总转换后的各组结果
df.groupby(
by=None,
axis=0,
level=None,
as_index: 'bool' = True,
sort: 'bool' = True,
group_keys: 'bool' = True,
squeeze: 'bool' = <object object at 0x137ac9240>,
observed: 'bool' = False,
dropna: 'bool' = True,
)
返回值 包含有关组信息的groupby对象
groupby的一个属性二个方法:
- groups:返回一个字典类型的对象,包含分组信息。
- size():返回每组记录的数量。
- describe():分组统计信息。
import pandas as pd
import numpy as np
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'data':[0,5,10,5,10,15,10,15,20]})
print(df)
# key data
# 0 A 0
# 1 B 5
# 2 C 10
# 3 A 5
# 4 B 10
# 5 C 15
# 6 A 10
# 7 B 15
# 8 C 20
# 分组,通过key将数据分成ABC三组
grouped=df.groupby(['key']) #通过key分组
#查看分组情况
for dtype,group in grouped:
print(dtype,group)
# A key data
# 0 A 0
# 3 A 5
# 6 A 10
# B key data
# 1 B 5
# 4 B 10
# 7 B 15
# C key data
# 2 C 10
# 5 C 15
# 8 C 20
dfsum=df.groupby(['key']).sum() # 链式调用先分组再用聚合函数聚合
print(dfsum)
# data
#key
#A 15
#B 30
#C 45
import pandas as pd
import numpy as np
df=pd.DataFrame({'key1':['A','B','C','A','B','C','A','B','C'],
'key2':['one','one','one','two','two','two','three','three','three'],
'data1':[0,5,10,5,10,15,10,15,20],
})
print(df)
# key1 key2 data1
# 0 A one 0
# 1 B one 5
# 2 C one 10
# 3 A two 5
# 4 B two 10
# 5 C two 15
# 6 A three 10
# 7 B three 15
# 8 C three 20
print()
grouped=df.groupby(['key2'])
for gram,data in grouped:
print(gram,data)
# one key1 key2 data1
# 0 A one 0
# 1 B one 5
# 2 C one 10
# three key1 key2 data1
# 6 A three 10
# 7 B three 15
# 8 C three 20
# two key1 key2 data1
# 3 A two 5
# 4 B two 10
# 5 C two 15
import pandas as pd
#数据集
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]},
index=[1,2,3,4,5,6,7,8,9])
#自定义函数区分大于5和小于5的数据
def big5(x):
result=0
if x>5:
result=1
return result
by_big5=df.groupby(big5) #根据索引是否大于5进行分组
#查看分组情况
for group_name,group_data in by_big5:
print(group_data)
# key data
# 1 A 0
# 2 B 5
# 3 C 10
# 4 A 5
# 5 B 10
# key data
# 6 C 15
# 7 A 10
# 8 B 15
# 9 C 20
groupby多列分组
import pandas as pd
#数据集
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'bool':[1,1,1,1,2,2,2,2,2],
'data':[0,5,10,5,10,15,10,15,20]
},
index=[1,2,3,4,5,6,7,8,9])
print(df)
# key bool data
# 1 A 1 0
# 2 B 1 5
# 3 C 1 10
# 4 A 1 5
# 5 B 2 10
# 6 C 2 15
# 7 A 2 10
# 8 B 2 15
# 9 C 2 20
grouped=df.groupby(['key','bool'])
print(grouped.sum())
# data
# key bool
# A 1 5
# 2 10
# B 1 5
# 2 25
# C 1 10
# 2 35
每隔n个分组
import pandas as pd
d = pd.DataFrame(list(zip(list(range(1,10)), list(range(11,20)))), columns=['a', 'b'])
print(d)
d=d.assign(c=d.groupby(d.index//3)['b'].transform('sum'))
print(d)
# a b
# 0 1 11
# 1 2 12
# 2 3 13
# 3 4 14
# 4 5 15
# 5 6 16
# 6 7 17
# 7 8 18
# 8 9 19
# a b c
# 0 1 11 36
# 1 2 12 36
# 2 3 13 36
# 3 4 14 45
# 4 5 15 45
# 5 6 16 45
# 6 7 17 54
# 7 8 18 54
# 8 9 19 54
filter过滤
DataFrame.filter(self, items=None, like=None, regex=None, axis=None)
根据分组数据进行过滤
import pandas as pd
#数据集
df=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],
'data':[0,5,10,5,10,15,10,15,20]})
def filtersum25(x):
if x['data'].sum()>25:
return True
else:
return False
grouped=df.groupby(['key']) # 先进行分组
for group_name,group_data in grouped:
print(group_data)
# key data
# 0 A 0
# 3 A 5
# 6 A 10
# key data
# 1 B 5
# 4 B 10
# 7 B 15
# key data
# 2 C 10
# 5 C 15
# 8 C 20
print()
filter= grouped.filter(filtersum25) # 再进行分组过滤
print(filter)
# key data
#1 B 5
#2 C 10
#4 B 10
#5 C 15
#7 B 15
#8 C 20
agg整合
df = pd.DataFrame([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[np.nan, np.nan, np.nan]],
columns=['A', 'B', 'C'])
print(df)
# -----return------
# A B C
# 0 1.0 2.0 3.0
# 1 4.0 5.0 6.0
# 2 7.0 8.0 9.0
# 3 NaN NaN NaN
print(df.agg(sum))
# ----return------
# A 12.0
# B 15.0
# C 18.0
# dtype: float64
参考资料
https://zhuanlan.zhihu.com/p/370851569
https://blog.csdn.net/weixin_43790276/article/details/125922102