Python学习笔记:描述性统计describe
一、介绍
data.describe()
即可很方便地输出数据的统计信息。
但还有更详细的使用方法:
DataFrame.describe(percentiles=[0.1,0.2,0.5,0.75],
include=None,
exclude=None)
参数解释:
percentiles -- 由0-1之间的数字组成的列表,指定要返回的百分位数,默认为[0.25, 0.5, 0.75]
include -- 包含的数据类型
exclude -- 剔除的数据类型
二、实操
- 默认统计量
import pandas as pd
import numpy as np
series = pd.Series(np.random.randn(100))
series.describe()
'''
count 100.000000 计数
mean -0.049944 均值
std 0.967943 标准差
min -2.692278 最小值
25% -0.717809 25%分位数
50% -0.061116 中位数
75% 0.682023 75%分位数
max 1.825730 最大值
dtype: float64
'''
- percentiles参数
series.describe(percentiles=[0.05,0.25,0.3,0.7,0.8])
'''
count 100.000000
mean -0.049944
std 0.967943
min -2.692278
5% -1.617615
25% -0.717809
30% -0.574646
50% -0.061116
70% 0.543954
80% 0.776378
max 1.825730
dtype: float64
'''
- include参数
df = pd.DataFrame({"class":["语文","语文","语文","语文","语文","数学","数学","数学","数学","数学"],
"name":["小明","小苏","小周","小孙","小王","小明","小苏","小周","小孙","小王"],
"score":[137,125,125,115,115,80,111,130,130,140]})
df
# 默认输出数值型特征的统计量
df.describe()
df.describe(include=[np.number])
'''
score
count 10.000000
mean 120.800000
std 17.203359
min 80.000000
25% 115.000000
50% 125.000000
75% 130.000000
max 140.000000
'''
# 计算离散型变量的统计特征
df.describe(include=['O'])
df.describe(include=[object])
'''
class name
count 10 10 非空计数
unique 2 5 唯一值
top 数学 小孙 出现最频繁
freq 5 2 频次
'''
# all 输出全部特征
df.describe(include='all')
'''
class name score
count 10 10 10.000000
unique 2 5 NaN
top 数学 小孙 NaN
freq 5 2 NaN
mean NaN NaN 120.800000
std NaN NaN 17.203359
min NaN NaN 80.000000
25% NaN NaN 115.000000
50% NaN NaN 125.000000
75% NaN NaN 130.000000
max NaN NaN 140.000000
'''
- exclude参数
# 剔除指定数据类型的列('O' 表示 object 类型)
df.describe(exclude='O')
'''
score
count 10.000000
mean 120.800000
std 17.203359
min 80.000000
25% 115.000000
50% 125.000000
75% 130.000000
max 140.000000
'''