直方图及分布曲线

直方图可以大致查看数据分布是否为正态。通常还需要将正态分布的曲线叠加在上面。

导入需要的包

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

定义函数,用于计算均值为mu、标准差为sigma的正态分布在x处的概率密度值(即曲线的y值)

def norm(x, mu, sigma):
    """Return the normal probability density evaluated at ``x``.

    Implements ``1 / (sqrt(2*pi) * sigma) * exp(-0.5 * ((x - mu) / sigma)**2)``.

    Args:
        x: Point(s) at which to evaluate the density. The computation uses
            NumPy arithmetic, so a scalar or a NumPy array both work.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution (not the variance).

    Returns:
        The density value(s), with the same shape as ``x``.
    """
    # Normalizing constant in front of the exponential.
    coeff = 1 / (np.sqrt(2 * np.pi) * sigma)
    # Standardized distance of x from the mean.
    z = 1 / sigma * (x - mu)
    return coeff * np.exp(-0.5 * z ** 2)

生成测试数据

# Number of histogram bins used when plotting.
num_bins = 100
# Draw 100000 samples from the standard normal distribution (mean 0, std 1).
data = np.random.normal(loc=0.0, scale=1.0, size=100000)
# Sample mean and standard deviation (NumPy default, ddof=0); these
# parameterize the normal curve drawn over the histogram.
mu = data.mean()
sigma = data.std()

绘制直方图

# Create the figure and a single axes to draw on.
fig, ax = plt.subplots()
# Histogram of the samples. density=True normalizes the bar heights so the
# histogram is on the same scale as a probability density curve.
# `bins` receives the bin edges, which are reused as x values for the curve.
n, bins, patches = ax.hist(data, bins=num_bins, density=True, color='#2792C3')

添加正态分布的曲线

# Evaluate the fitted normal density at every histogram bin edge and
# overlay it on the histogram as a dashed line.
y = norm(bins, mu, sigma)
ax.plot(bins, y, color='#EE827C', linestyle='--')

添加95%分位数线

# 95% quantile of the standard normal distribution (a z-score).
tmp_thr = stats.norm.ppf(0.95)
# Rescale the z-score to the fitted distribution: thr = mu + z * sigma.
thr = mu + sigma * tmp_thr
# Height of the fitted density at the threshold, so the marker line stops
# exactly at the curve.
y_thr = norm(thr, mu, sigma)
# Vertical dashed line from the x axis up to the curve at the threshold.
ax.plot((thr, thr), (0, y_thr), color='#EE827C', linestyle='--')

用颜色填充曲线下方大于95%分位数的区域(即右侧5%的尾部)。其中,zorder用来指定图形元素的绘制层级,数值越大越靠上层,可用于调整与其他元素的遮挡关系。

# 100 evenly spaced x values from the 95% threshold to the largest sample.
tmp_x = np.linspace(thr, data.max(), num=100)
# Fitted density over that range (upper boundary of the shaded region).
tmp_y = norm(tmp_x, mu, sigma)
# Shade the area between the x axis and the curve; zorder=2 sets the
# drawing layer and alpha=0.5 makes the fill semi-transparent.
ax.fill_between(tmp_x, 0, tmp_y, facecolor='#EE827C', alpha=0.5, zorder=2)

设置坐标轴标签并保存图片

# Label both axes in one call.
ax.set(xlabel='Data', ylabel='Density')

# Write the current figure to a PDF; bbox_inches='tight' trims the
# surrounding whitespace.
plt.savefig('demo.pdf', bbox_inches='tight')

示例图

完整代码

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def norm(x, mu, sigma):
    """Return the normal probability density evaluated at ``x``.

    Implements ``1 / (sqrt(2*pi) * sigma) * exp(-0.5 * ((x - mu) / sigma)**2)``.

    Args:
        x: Point(s) at which to evaluate the density. The computation uses
            NumPy arithmetic, so a scalar or a NumPy array both work.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution (not the variance).

    Returns:
        The density value(s), with the same shape as ``x``.
    """
    # Normalizing constant in front of the exponential.
    coeff = 1 / (np.sqrt(2 * np.pi) * sigma)
    # Standardized distance of x from the mean.
    z = 1 / sigma * (x - mu)
    return coeff * np.exp(-0.5 * z ** 2)

# Generate the test data: 100000 draws from the standard normal
# distribution, then estimate the mean and standard deviation (ddof=0)
# that parameterize the fitted curve.
num_bins = 100
data = np.random.normal(loc=0.0, scale=1.0, size=100000)
mu = data.mean()
sigma = data.std()

# Histogram of the samples. density=True normalizes the bar heights so the
# histogram is on the same scale as a probability density curve.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(data, bins=num_bins, density=True, color='#2792C3')

# Normal distribution line: the fitted density evaluated at the bin edges.
y = norm(bins, mu, sigma)
ax.plot(bins, y, color='#EE827C', linestyle='--')

# Line at the 95% quantile: take the standard-normal quantile (a z-score),
# rescale it to the fitted distribution, and draw a vertical dashed line
# from the x axis up to the curve.
tmp_thr = stats.norm.ppf(0.95)
thr = mu + sigma * tmp_thr
y_thr = norm(thr, mu, sigma)
ax.plot((thr, thr), (0, y_thr), color='#EE827C', linestyle='--')

# Fill the area under the curve beyond the 95% quantile with color;
# zorder=2 sets the drawing layer of the filled patch.
tmp_x = np.linspace(thr, data.max(), num=100)
tmp_y = norm(tmp_x, mu, sigma)
ax.fill_between(tmp_x, 0, tmp_y, facecolor='#EE827C', alpha=0.5, zorder=2)

# Axis labels.
ax.set(xlabel='Data', ylabel='Density')

# Save the figure as a PDF with the surrounding whitespace trimmed.
plt.savefig('demo.pdf', bbox_inches='tight')
posted @ 2020-10-23 17:47  海拉鲁捡垃圾  阅读(1702)  评论(0编辑  收藏  举报