直方图及分布曲线
通过直方图可以大致判断数据是否服从正态分布。通常还会在直方图上叠加对应正态分布的概率密度曲线,以便直观对比。
导入需要的包
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
定义函数:给定x,返回均值为mu、标准差为sigma的正态分布概率密度值(即曲线的y值)
def norm(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Args:
        x: Scalar or array-like of points at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution (not the variance).

    Returns:
        The density value(s); an array with the shape of ``x`` for
        array-like input, a NumPy scalar for scalar input.
    """
    # Standardise first; np.asarray lets plain Python lists/tuples be used.
    z = (np.asarray(x) - mu) / sigma
    return np.exp(-0.5 * z ** 2) / (np.sqrt(2 * np.pi) * sigma)
生成测试数据
# Draw 100000 samples from the standard normal distribution (mean 0, std 1).
data = np.random.normal(loc=0.0, scale=1.0, size=100000)
# Sample mean and standard deviation, used to parameterise the overlay curve.
mu, sigma = data.mean(), data.std()
# Number of histogram bins.
num_bins = 100
绘制直方图
# One figure with a single axes to draw on.
fig, ax = plt.subplots()
# Histogram normalised to a density (density=True) so it shares a y-scale
# with the probability density curve; `bins` receives the bin edges.
n, bins, patches = ax.hist(data, bins=num_bins, density=True, color='#2792C3')
添加正态分布的曲线
# Evaluate the normal density at the histogram bin edges and overlay it
# on the histogram as a dashed line.
y = norm(bins, mu, sigma)
ax.plot(bins, y, linestyle='--', color='#EE827C')
添加95%分位数线
# 95% quantile of the standard normal, rescaled to the N(mu, sigma) scale.
tmp_thr = stats.norm.ppf(0.95)
thr = mu + sigma * tmp_thr
# Density at the threshold, so the vertical marker ends on the curve.
y_thr = norm(thr, mu, sigma)
ax.plot([thr, thr], [0, y_thr], linestyle='--', color='#EE827C')
用颜色填充大于95%分位数的区域(曲线下方的右侧尾部)。其中,zorder用来指定图形元素的绘制层级(数值越大越靠上层),即调整它与其他元素的重叠关系。
# 100 evenly spaced x values from the threshold to the largest observation,
# with the matching density values.
tmp_x = np.linspace(thr, data.max(), 100)
tmp_y = norm(tmp_x, mu, sigma)
# Shade the area between the x-axis and the curve; semi-transparent,
# with an explicit zorder to control how it stacks with the other artists.
ax.fill_between(tmp_x, 0, tmp_y, alpha=0.5, zorder=2, facecolor='#EE827C')
设置坐标轴标签并保存图片
# Label both axes in a single call.
ax.set(xlabel='Data', ylabel='Density')
# Write the figure to disk, trimming surrounding whitespace.
plt.savefig('demo.pdf', bbox_inches='tight')
示例图
完整代码
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def norm(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Args:
        x: Scalar or array-like of points at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution (not the variance).

    Returns:
        The density value(s); an array with the shape of ``x`` for
        array-like input, a NumPy scalar for scalar input.
    """
    # Standardise first; np.asarray lets plain Python lists/tuples be used.
    z = (np.asarray(x) - mu) / sigma
    return np.exp(-0.5 * z ** 2) / (np.sqrt(2 * np.pi) * sigma)
# Test data: 100000 samples from the standard normal distribution.
data = np.random.normal(loc=0.0, scale=1.0, size=100000)
# Sample mean and standard deviation, used to parameterise the overlay curve.
mu, sigma = data.mean(), data.std()
# Number of histogram bins.
num_bins = 100
# One figure with a single axes to draw on.
fig, ax = plt.subplots()
# Histogram normalised to a density (density=True) so it shares a y-scale
# with the probability density curve; `bins` receives the bin edges.
n, bins, patches = ax.hist(data, bins=num_bins, density=True, color='#2792C3')
# Normal distribution curve: evaluate the density at the histogram bin
# edges and overlay it as a dashed line.
y = norm(bins, mu, sigma)
ax.plot(bins, y, linestyle='--', color='#EE827C')
# 95% quantile of the standard normal, rescaled to the N(mu, sigma) scale.
tmp_thr = stats.norm.ppf(0.95)
thr = mu + sigma * tmp_thr
# Density at the threshold, so the vertical marker ends on the curve.
y_thr = norm(thr, mu, sigma)
ax.plot([thr, thr], [0, y_thr], linestyle='--', color='#EE827C')
# Shade the tail above the 95% quantile: 100 evenly spaced x values from
# the threshold to the largest observation, with matching density values.
tmp_x = np.linspace(thr, data.max(), 100)
tmp_y = norm(tmp_x, mu, sigma)
# Semi-transparent fill between the x-axis and the curve, with an explicit
# zorder to control how it stacks with the other artists.
ax.fill_between(tmp_x, 0, tmp_y, alpha=0.5, zorder=2, facecolor='#EE827C')
# Label both axes in a single call.
ax.set(xlabel='Data', ylabel='Density')
# Save the figure to disk, trimming surrounding whitespace.
plt.savefig('demo.pdf', bbox_inches='tight')