pandas函数中的Groupby基础知识

Groupby

1.Groupby的基础操作

import pandas as pd
import numpy as np

# Sample course table used by every example below.
# NOTE: the missing entries in "Teacher" and "Discount" are the literal string
# "nan", not a float NaN. The original author stresses that the missing values
# must be given as strings here and that the reason deserves careful thought.
# NOTE(review): presumably this keeps the "nan" teacher as a regular group key
# and keeps "Discount" out of the numeric aggregations - confirm.
data = {
    "Courses": ["Numpy", "Pandas", "Java", "Pandas", "Python", "Python"],
    "Teacher": ["Jack", "Lemon", "nan", "Lemon", "Peter", "Lemon"],
    "Fee": [3000, 4500, 5000, 4000, 4900, 5000],
    "Duration": ["45days", "30days", "55days", "20days", "60days", "35days"],
    "Discount": [200.0, 300.0, 100.0, 120.0, 200.0, "nan"],
}
df = pd.DataFrame(data)

# Aggregating over a single column
g = df.groupby(by="Courses")

# size()
# Calling size() on the groupby object lists how many rows each group holds.
g.size()
"""
Courses
Java      1
Numpy     1
Pandas    2
Python    2
dtype: int64
"""

# g.describe()
# describe() lists summary statistics for the data columns of every group;
# the figures are rounded to two decimal places.
g.describe().round(decimals=2)

"""
            Fee
          count    mean     std     min     25%     50%     75%     max
Courses
Java        1.0  5000.0     NaN  5000.0  5000.0  5000.0  5000.0  5000.0
Numpy       1.0  3000.0     NaN  3000.0  3000.0  3000.0  3000.0  3000.0
Pandas      2.0  4250.0  353.55  4000.0  4125.0  4250.0  4375.0  4500.0
Python      2.0  4950.0   70.71  4900.0  4925.0  4950.0  4975.0  5000.0
"""


## groupby splits the rows by the values of a column and aggregates each group.
# Group by "Courses" and take the mean of every group.
# A mean only exists for numeric data (integers, floats). This frame also
# carries string columns, so numeric_only=True is passed explicitly: without
# it older pandas emits a warning, and pandas >= 2.0 raises TypeError.
df.groupby("Courses").mean(numeric_only=True)


# Aggregating over several columns
# Rows are grouped by the ("Courses", "Teacher") pair, then the numeric
# columns of each group are averaged.
g = (
    df.groupby(by=["Courses", "Teacher"])
    .mean(numeric_only=True)
)

"""
                    Fee
Courses Teacher
Java    nan      5000.0
Numpy   Jack     3000.0
Pandas  Lemon    4250.0
Python  Lemon    5000.0
        Peter    4900.0
"""

# as_index
# With as_index=False the grouping column stays an ordinary column of the
# result instead of becoming its index; rows sharing a course are merged into
# one row. numeric_only=True is required because the frame contains string
# columns, for which mean() raises TypeError on pandas >= 2.0.
df.groupby("Courses", as_index=False).mean(numeric_only=True)

# get_group
# get_group() extracts the rows of one particular group from the groupby object.
df.groupby(by="Courses").get_group(name="Pandas")
"""
  Courses Teacher   Fee Duration Discount
1  Pandas   Lemon  4500   30days    300.0
3  Pandas   Lemon  4000   20days    120.0
"""

# A groupby object built from several columns works the same way: one specific
# group is selected by passing its key tuple.
df.groupby(by=["Courses", "Teacher"]).get_group(name=("Python", "Lemon"))
"""
  Courses Teacher   Fee Duration Discount
5  Python   Lemon  5000   35days      nan
"""

# axis=1
# Grouping along axis 1 groups the columns instead of the rows: "lst" supplies
# one label per column, and columns with the same label form one group. Here
# column "a" is labelled "A" while "b" and "c" are both labelled "B", so the
# result has the two groups "A" and "B", each summed row by row.
data2 = {"a": [80, 90, 60, 73, 89], "b": [80, 75, 80, 85, 83], "c": [70, 75, 80, 73, 62]}
df2 = pd.DataFrame(data=data2, index=list("efxyz"))
lst = ["A", "B", "B"]
# groupby(..., axis=1) is deprecated since pandas 2.1. Grouping the transposed
# frame and transposing the result back is the documented replacement and
# yields the same table.
grouped_sums = df2.T.groupby(lst).sum().T
grouped_sums
"""
    A    B
e  80  150
f  90  150
x  60  160
y  73  158
z  89  145
"""

posted @ 2023-03-11 22:43  小杨的冥想课  阅读(83)  评论(0)  编辑  收藏  举报