pandas函数中的Groupby基础知识
Groupby
1.Groupby的基础操作
import pandas as pd
import numpy as np
# Sample course records, stored column-wise (one list per column).
# Author's note: the missing values have to be written as strings here -- this
# deserves careful thought. The gaps in "Teacher" and "Discount" are the
# literal text "nan", not np.nan, so "Discount" holds a mix of floats and one
# string rather than being a purely numeric column.
data = {
    "Courses": ["Numpy", "Pandas", "Java", "Pandas", "Python", "Python"],
    "Teacher": ["Jack", "Lemon", "nan", "Lemon", "Peter", "Lemon"],
    "Fee": [3000, 4500, 5000, 4000, 4900, 5000],
    "Duration": ["45days", "30days", "55days", "20days", "60days", "35days"],
    "Discount": [200.0, 300.0, 100.0, 120.0, 200.0, "nan"],
}
df = pd.DataFrame(data=data)
# Single-column aggregation: split the rows of df by course name.
g = df.groupby(by="Courses")
# size(): reports how many rows fall into each group.
g.size()
"""
Courses
Java 1
Numpy 1
Pandas 2
Python 2
dtype: int64
"""
# describe(): lists summary statistics of the data columns for every group,
# rounded here to two decimals.
g.describe().round(decimals=2)
"""
Courses count mean std min 25% 50% 75% max
Java 1.0 5000.0 NaN 5000.0 5000.0 5000.0 5000.0 5000.0
Numpy 1.0 3000.0 NaN 3000.0 3000.0 3000.0 3000.0 3000.0
Pandas 2.0 4250.0 353.55 4000.0 4125.0 4250.0 4375.0 4500.0
Python 2.0 4950.0 70.71 4900.0 4925.0 4950.0 4975.0 5000.0
"""
# groupby can split the rows on one column and then aggregate each group;
# here the rows are grouped by "Courses" and averaged.
# mean() is only defined for numeric (integer / float) columns. This frame
# also contains string columns, so a bare .mean() emitted a FutureWarning on
# pandas 1.5 and raises TypeError on pandas >= 2.0. Passing numeric_only=True
# restricts the aggregation to the numeric columns and works on both.
df.groupby("Courses").mean(numeric_only=True)
# Multi-column aggregation: group by the ("Courses", "Teacher") pair, then
# take the mean of each group.
g = df.groupby(["Courses", "Teacher"]).mean(numeric_only=True)
"""
Courses Teacher Fee
Java nan 5000.0
Numpy Jack 3000.0
Pandas Lemon 4250.0
Python Lemon 5000.0
Peter 4900.0
"""
# as_index
# With as_index=False the grouping key stays an ordinary column of the result
# instead of becoming its index, i.e. rows sharing the same key are merged
# into one row.
# numeric_only=True is required because the frame has string columns: without
# it, mean() raises TypeError on pandas >= 2.0.
df.groupby("Courses", as_index=False).mean(numeric_only=True)
# get_group
# get_group() extracts the rows belonging to one particular group of a
# grouped object.
df.groupby(by="Courses").get_group("Pandas")
"""
1 Pandas Lemon 4500 30days 300.0
3 Pandas Lemon 4000 20days 120.0
"""
# A grouping built on several columns works the same way: pass the key of the
# wanted group as a tuple, one value per grouping column.
df.groupby(by=["Courses", "Teacher"]).get_group(("Python", "Lemon"))
"""
5 Python Lemon 5000 35days nan
"""
# Grouping along axis 1 (the columns)
# Columns whose entries in `lst` are equal land in the same group, so column
# "a" forms group "A" and columns "b" and "c" form group "B"; each group is
# then summed row by row.
data2 = {"a": [80, 90, 60, 73, 89], "b": [80, 75, 80, 85, 83], "c": [70, 75, 80, 73, 62]}
df2 = pd.DataFrame(data=data2, index=list("efxyz"))
lst = ["A", "B", "B"]
# groupby(..., axis=1) is deprecated since pandas 2.1 and removed in 3.0.
# Transposing, grouping the rows, and transposing back gives the same table.
grouped_sum = df2.T.groupby(lst).sum().T
grouped_sum
"""
e 80 150
f 90 150
x 60 160
y 73 158
z 89 145
"""
记录学习的点点滴滴