Pandas_groupby分组统计

import numpy as np
import pandas as pd
 
# Demo frame: two categorical key columns (A, B) and two random numeric
# value columns (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

# 1) Group rows and summarise every group with one aggregation function.
# Single-key groupby. numeric_only=True is spelled out because pandas >= 2.0
# no longer drops non-numeric columns (here: B) automatically.
df.groupby('A').sum(numeric_only=True)
# Multi-key groupby: the grouping columns become a two-level row index.
df.groupby(['A', 'B']).mean()
# as_index=False keeps the grouping keys as ordinary columns instead.
df.groupby(['A', 'B'], as_index=False).mean()

# 2) Several statistics at once: the result columns become a MultiIndex.
# Only the numeric columns are selected, because 'mean'/'std' cannot be
# computed for the string column B. String names are used instead of
# np.sum/np.mean/np.std, which pandas deprecates as aggregation arguments.
df.groupby('A')[['C', 'D']].agg(['sum', 'mean', 'std'])

# 3) Statistics for a single column.
# 3.1) Preferred: select column C first, then aggregate only that column.
df.groupby('A')['C'].agg(['sum', 'mean', 'std'])

# 3.2) Alternative: aggregate first, then pick the C sub-frame from the
# MultiIndex columns (does more work than 3.1 for the same result).
df.groupby('A')[['C', 'D']].agg(['sum', 'mean', 'std'])['C']

# 4) A different aggregation function per column.
df.groupby('A').agg({
    'C': 'sum',
    'D': 'mean',
})
 
# 5) Iterate over a groupby object to see how it works: each iteration
# yields the group key and the sub-DataFrame holding that group's rows.
g = df.groupby('A')
for name, group in g:
    print(name)
    print('--------')
    print(group)

# Fetch the rows of one specific group.
g.get_group('bar')

# Iterating a multi-key groupby: the group key is a tuple of key values.
g = df.groupby(['A', 'B'])
for name, group in g:
    print(name)
    print(group)
    print()

# Fetch one group of a multi-key groupby by passing the key tuple.
g.get_group(('foo', 'one'))

# 6) Columns can be selected directly on the groupby object, which yields
# a Series (single column) or a sub-DataFrame (list of columns) per group.
g['C']
for name, group in g['C']:
    print(name)
    print(group)
    print(type(group))
    print()
# Sample output (values differ per run because C is random):
# ('bar', 'one')
# 1    0.724489
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# ('bar', 'three')
# 3   -1.076681
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# ('bar', 'two')
# 5   -0.608787
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# ('foo', 'one')
# 0    1.156460
# 6   -0.438206
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# ('foo', 'three')
# 7    1.028774
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# ('foo', 'two')
# 2    0.426645
# 4   -0.809016
# Name: C, dtype: float64
# <class 'pandas.core.series.Series'>
# Every aggregation is ultimately computed on a DataFrame or a Series.
 
# 7) Worked example: explore weather data with groupby.
file_path = 'data/beijing_tianqi_2018.csv'
df = pd.read_csv(file_path)
# Strip the '度' suffix from both temperature columns and convert to int32.
# Plain column assignment replaces the column so it really becomes int32;
# `df.loc[:, col] = ...` writes into the existing object-dtype column on
# pandas >= 2.0 and would leave it non-numeric.
df['bWendu'] = df['bWendu'].str.replace('度', '', regex=False).astype('int32')
df['yWendu'] = df['yWendu'].str.replace('度', '', regex=False).astype('int32')
df.head()
#          ymd  bWendu  yWendu tianqi fengxiang fengli  aqi aqiInfo  aqiLevel
# 0  2018-01-01       3      -6   晴~多云       东北风   1-2级   59       良         2
# 1  2018-01-02      -3       6   阴~多云       东北风   1-2级   49       良         1
# 2  2018-01-03      -6      -6     多云        北风   1-2级   28       优         1
# Add a month column: the first 7 characters of ymd, i.e. 'YYYY-MM'.
df['month'] = df['ymd'].str[:7]
df.head()
#           ymd  bWendu  yWendu tianqi  ... aqi aqiInfo  aqiLevel    month
# 0  2018-01-01       3      -6   晴~多云  ...  59       良         2  2018-01
# 1  2018-01-02      -3       6   阴~多云  ...  49       良         1  2018-01
# 2  2018-01-03      -6      -6     多云  ...  28       优         1  2018-01
# [3 rows x 10 columns]

# 7.1) Highest temperature of every month.
data = df.groupby('month')['bWendu'].max()
type(data)
# type(data)
# Out[3]: pandas.core.series.Series
data.plot()  # draw a line chart of the monthly maxima
# Out[4]: <AxesSubplot:xlabel='month'>

# 7.2) Per month: highest temperature, lowest temperature and mean AQI.
# String function names are used instead of np.max/np.min/np.mean, which
# pandas deprecates as aggregation arguments.
df.head()
group_data = df.groupby('month').agg({
    'bWendu': 'max',
    'yWendu': 'min',
    'aqi': 'mean',
})
group_data
#          bWendu  yWendu        aqi
# month
# 2018-01       3      -6  45.333333
group_data.plot()

posted @ 2022-03-08 16:42 aall_blue 阅读(76) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· Pandas 新增修改数据 + 数据统计函数 + 根据axis进行删除

· Pandas_数据转换函数map，apply，applymap

· pandas分组统计-groupby详解

· pandas-分组过滤聚合

· Pandas分组聚合

阅读排行：
· winform 绘制太阳，地球，月球运作规律
· AI与.NET技术实操系列（五）：向量存储与相似性搜索在 .NET 中的实现
· 超详细：普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾（3.3-3.9）
· AI 智能体引爆开源社区「GitHub 热点速览」

公告

昵称： aall_blue
园龄： 3年1个月
粉丝： 0
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

随笔分类

随笔档案

2022年3月(15)

aall_blue

Pandas_groupby分组统计

公告

搜索

常用链接

随笔分类

随笔档案

阅读排行榜

	import pandas as pd
	import numpy as np

	# Demo frame: two categorical key columns (A, B) and two random numeric
	# value columns (C, D).
	df = pd.DataFrame({
		'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
		'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
		'C': np.random.randn(8),
		'D': np.random.randn(8),
	})

	# 1) Group rows and summarise every group with one aggregation function.
	# Single-key groupby. numeric_only=True is spelled out because
	# pandas >= 2.0 no longer drops non-numeric columns (here: B) automatically.
	df.groupby('A').sum(numeric_only=True)
	# Multi-key groupby: the grouping columns become a two-level row index.
	df.groupby(['A', 'B']).mean()
	# as_index=False keeps the grouping keys as ordinary columns instead.
	df.groupby(['A', 'B'], as_index=False).mean()

	# 2) Several statistics at once: the result columns become a MultiIndex.
	# Only the numeric columns are selected, because 'mean'/'std' cannot be
	# computed for the string column B. String names are used instead of
	# np.sum/np.mean/np.std, which pandas deprecates as aggregation arguments.
	df.groupby('A')[['C', 'D']].agg(['sum', 'mean', 'std'])

	# 3) Statistics for a single column.
	# 3.1) Preferred: select column C first, then aggregate only that column.
	df.groupby('A')['C'].agg(['sum', 'mean', 'std'])

	# 3.2) Alternative: aggregate first, then pick the C sub-frame from the
	# MultiIndex columns (does more work than 3.1 for the same result).
	df.groupby('A')[['C', 'D']].agg(['sum', 'mean', 'std'])['C']

	# 4) A different aggregation function per column.
	df.groupby('A').agg({
		'C': 'sum',
		'D': 'mean',
	})

	# 5) Iterate over a groupby object to see how it works: each iteration
	# yields the group key and the sub-DataFrame holding that group's rows.
	g = df.groupby('A')
	for name, group in g:
		print(name)
		print('--------')
		print(group)

	# Fetch the rows of one specific group.
	g.get_group('bar')

	# Iterating a multi-key groupby: the group key is a tuple of key values.
	g = df.groupby(['A', 'B'])
	for name, group in g:
		print(name)
		print(group)
		print()

	# Fetch one group of a multi-key groupby by passing the key tuple.
	g.get_group(('foo', 'one'))

	# 6) Columns can be selected directly on the groupby object, which yields
	# a Series (single column) or a sub-DataFrame (list of columns) per group.
	g['C']
	for name, group in g['C']:
		print(name)
		print(group)
		print(type(group))
		print()
	# Sample output (values differ per run because C is random):
	# ('bar', 'one')
	# 1 0.724489
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# ('bar', 'three')
	# 3 -1.076681
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# ('bar', 'two')
	# 5 -0.608787
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# ('foo', 'one')
	# 0 1.156460
	# 6 -0.438206
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# ('foo', 'three')
	# 7 1.028774
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# ('foo', 'two')
	# 2 0.426645
	# 4 -0.809016
	# Name: C, dtype: float64
	# <class 'pandas.core.series.Series'>
	# Every aggregation is ultimately computed on a DataFrame or a Series.

	# 7) Worked example: explore weather data with groupby.
	file_path = 'data/beijing_tianqi_2018.csv'
	df = pd.read_csv(file_path)
	# Strip the '度' suffix from both temperature columns and convert to int32.
	# Plain column assignment replaces the column so it really becomes int32;
	# `df.loc[:, col] = ...` writes into the existing object-dtype column on
	# pandas >= 2.0 and would leave it non-numeric.
	df['bWendu'] = df['bWendu'].str.replace('度', '', regex=False).astype('int32')
	df['yWendu'] = df['yWendu'].str.replace('度', '', regex=False).astype('int32')
	df.head()
	# ymd bWendu yWendu tianqi fengxiang fengli aqi aqiInfo aqiLevel
	# 0 2018-01-01 3 -6 晴~多云东北风 1-2级 59 良 2
	# 1 2018-01-02 -3 6 阴~多云东北风 1-2级 49 良 1
	# 2 2018-01-03 -6 -6 多云北风 1-2级 28 优 1
	# Add a month column: the first 7 characters of ymd, i.e. 'YYYY-MM'.
	df['month'] = df['ymd'].str[:7]
	df.head()
	# ymd bWendu yWendu tianqi ... aqi aqiInfo aqiLevel month
	# 0 2018-01-01 3 -6 晴~多云 ... 59 良 2 2018-01
	# 1 2018-01-02 -3 6 阴~多云 ... 49 良 1 2018-01
	# 2 2018-01-03 -6 -6 多云 ... 28 优 1 2018-01
	# [3 rows x 10 columns]

	# 7.1) Highest temperature of every month.
	data = df.groupby('month')['bWendu'].max()
	type(data)
	# type(data)
	# Out[3]: pandas.core.series.Series
	data.plot()  # draw a line chart of the monthly maxima
	# Out[4]: <AxesSubplot:xlabel='month'>

	# 7.2) Per month: highest temperature, lowest temperature and mean AQI.
	# String function names are used instead of np.max/np.min/np.mean, which
	# pandas deprecates as aggregation arguments.
	df.head()
	group_data = df.groupby('month').agg({
		'bWendu': 'max',
		'yWendu': 'min',
		'aqi': 'mean',
	})
	group_data
	# bWendu yWendu aqi
	# month
	# 2018-01 3 -6 45.333333
	group_data.plot()