PRML-第1章 绪论 总结
一些数学符号
\(N\)
样本量
\(x_n\)
每一个数据点,或者叫样本点,工程中/训练集中的x_train
\(t_n\)
训练集中的y_train
CDF
累计概率分布
PDF
概率密度函数
\(\mathcal{D}\)
观测
\(N个输入\),x_train
\(N个输出\),y_train
\(1个输入,D是维度\),x_test
\(1个输出\),y_test
\(w\)
自然参数
1.1 多项式拟合
\(y(x,w)=w_0+w_1x+w_2x^2+...+w_Mx^M=\sum_{j=0}^M{w_jx^j}\)\(\tag{1.1}\)
\(多项式函数拟合函数\)
\(E(\boldsymbol{w})=\frac{1}{2} \sum_{n=1}^{N}\left\{y\left(x_{n}, \boldsymbol{w}\right)-t_{n}\right\}^{2}\)\(\tag{1.2}\)
误差函数 Loss Function,此处是平方和误差
\(E_{R M S}=\sqrt{2 E\left(\boldsymbol{w}^{*}\right) / N}\) \(\tag{1.3}\)
均方根误差,为了比较不同大小的数据集和保证t有相同单位。引入均方根误差
\(\tilde{E}(\boldsymbol{w})=\frac{1}{2} \sum_{n=1}^{N}\left\{y\left(x_{n}, \boldsymbol{w}\right)-t_{n}\right\}^{2}+\frac{\lambda}{2}\|\boldsymbol{w}\|^{2} \tag{1.4}\)
正则化
\(\|\boldsymbol{w}\|^{2}\equiv\boldsymbol{w}_0^2+\boldsymbol{w}_1^2+...+\boldsymbol{w}_M^2\),\(\lambda\)的大小控制正则化影响的大小
\(w=(X^TX)^{−1}X^TY,或者按照本书写法应该是w=(X^TX)^{−1}X^Tt\)
最小二乘法解析解
\(w=(X^TX+\lambda I)^{−1}X^Tt\)
最小二乘法正则化后的解析解,\((X^TX+\lambda I)w=X^Tt,也可以按照解方程的思路求解\)
多项式拟合的python实现
点击查看代码
# preparation — third-party and project-local (prml) dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from prml.preprocess import PolynomialFeature
from prml.linear import (
    LinearRegression,
    RidgeRegression,
    BayesianRegression
)
np.random.seed(1234)  # fix the global RNG seed so the toy data is reproducible across runs
def create_toy_data(func, sample_size, std):
    """Sample `func` on [0, 1] and corrupt the values with Gaussian noise.

    :param func: callable applied elementwise to the input grid
    :param sample_size: number of evenly spaced sample points
    :param std: standard deviation of the additive Gaussian noise
    :return: tuple (x, t) of inputs and noisy targets
    """
    inputs = np.linspace(0, 1, sample_size)
    # Additive zero-mean Gaussian noise; uses the module-level seeded RNG,
    # so repeated runs of the script draw identical noise.
    noise = np.random.normal(scale=std, size=inputs.shape)
    targets = func(inputs) + noise
    return inputs, targets
def func(x):
    """Ground-truth curve to be fitted: one period of sin(2*pi*x) on [0, 1].

    :param x: scalar or ndarray of inputs
    :return: sin(2*pi*x), same shape as `x`
    """
    two_pi = 2 * np.pi
    return np.sin(two_pi * x)
# Build a small noisy training set and a dense noise-free test grid.
x_train, y_train = create_toy_data(func, 10, 0.25)
x_test = np.linspace(0, 1, 100)
y_test = func(x_test)
# Preview plot of the underlying target function (intentionally disabled):
# plt.scatter(x_train, y_train, facecolor="none", edgecolor="b", s=50, label="training data")
# plt.plot(x_test, y_test, c="g", label="$\sin(2\pi x)$")
# plt.legend()
# plt.show()
print("x_train =", x_train)
# Fit polynomials of several degrees (PRML Fig. 1.4) and draw each fit in its own subplot.
for i, degree in enumerate([0, 1, 3, 9]):
    print(i, degree)
    plt.subplot(2, 2, i + 1)
    feature = PolynomialFeature(degree)
    X_train = feature.transform(x_train)  # expand raw inputs into polynomial features
    print("X_train.shape=", X_train.shape)
    X_test = feature.transform(x_test)
    print("X_test.shape=", X_test.shape)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y, y_std = model.predict(X_test, True)
    print("y=", y)
    print("y_std", y_std)  # y_std is computed from the training residuals — independent of X_test
    plt.scatter(x_train, y_train, facecolor="none", edgecolor="b", s=50, label="training data")
    plt.plot(x_test, y_test, c="g", label="$\sin(2\pi x)$")
    plt.plot(x_test, y, c="r", label="fitting")
    plt.ylim(-1.5, 1.5)
    plt.annotate("M={}".format(degree), xy=(0.8, 1))
    plt.legend(bbox_to_anchor=(1.05, 0.64), loc=0, borderaxespad=0.)
plt.show()
点击查看代码
import itertools
import functools
import numpy as np
class PolynomialFeature(object):
    """
    Polynomial feature transformer.

    Expands each input row into all monomial combinations of its entries
    up to a given degree, starting with a constant (bias) term.

    Example
    =======
    x =
    [[a, b],
    [c, d]]
    y = PolynomialFeature(degree=2).transform(x)
    y =
    [[1, a, b, a^2, a * b, b^2],
    [1, c, d, c^2, c * d, d^2]]
    """

    def __init__(self, degree=2):
        """
        Construct the transformer.

        Parameters
        ----------
        degree : int
            Highest polynomial degree to generate.

        Raises
        ------
        TypeError
            If ``degree`` is not an int.  (Validated with an explicit raise
            rather than ``assert``, which is stripped under ``python -O``.)
        """
        if not isinstance(degree, int):
            raise TypeError(
                "degree must be an int, got {}".format(type(degree).__name__))
        self.degree = degree

    def transform(self, x):
        """
        Expand the input array into polynomial features.

        Parameters
        ----------
        x : (sample_size, n) ndarray
            Input array; a 1-D array is treated as a single feature column.

        Returns
        -------
        output : (sample_size, 1 + nC1 + ... + nCd) ndarray
            Polynomial features; the first column is the all-ones bias term.
        """
        if x.ndim == 1:
            x = x[:, None]  # promote (N,) to (N, 1): each entry is one feature
        x_t = x.transpose()
        features = [np.ones(len(x))]  # bias column of ones
        for degree in range(1, self.degree + 1):
            # combinations_with_replacement enumerates each monomial of this
            # degree exactly once (e.g. a*a, a*b, b*b for degree 2); repeated
            # elements are allowed, which is what produces the pure powers.
            for items in itertools.combinations_with_replacement(x_t, degree):
                # multiply the chosen feature columns elementwise to form the monomial
                features.append(functools.reduce(lambda u, v: u * v, items))
        return np.asarray(features).transpose()
点击查看代码
import numpy as np
from prml.linear.regression import Regression
class LinearRegression(Regression):
    """
    Ordinary least-squares linear regression.

    Model:
        y = X @ w
        t ~ N(t | X @ w, var)   (Gaussian observation noise)
    """

    def fit(self, X: np.ndarray, t: np.ndarray):
        """
        Fit the weights by least squares via the Moore-Penrose pseudo-inverse.

        Parameters
        ----------
        X : (N, D) np.ndarray
            Training inputs (design matrix).
        t : (N,) np.ndarray
            Training targets.
        """
        # Closed-form solution: pinv(X) = (X^T X)^{-1} X^T, hence w = pinv(X) @ t
        self.w = np.linalg.pinv(X) @ t
        # Noise variance estimated as the mean squared training residual
        residuals = X @ self.w - t
        self.var = np.mean(np.square(residuals))

    def predict(self, X: np.ndarray, return_std: bool = False):
        """
        Predict targets for new inputs.

        Parameters
        ----------
        X : (N, D) np.ndarray
            Inputs to predict for; D must match the fitted design matrix.
        return_std : bool, optional
            Also return the estimated noise standard deviation.

        Returns
        -------
        y : (N,) np.ndarray
            Predicted means.
        y_std : float
            Scalar standard deviation derived from the TRAINING residuals
            (the same value for every input; it does not depend on X).
        """
        y = X @ self.w
        if not return_std:
            return y
        y_std = np.sqrt(self.var)  # training-set residual std, not per-point
        return y, y_std
if __name__ == '__main__':
    # Quick demo of the @ operator: inner product for 1-D arrays,
    # matrix multiplication for 2-D arrays.
    vec_a = np.array([1, 2, 3, 4])
    vec_b = np.array([6, 7, 8, 9])
    print(vec_a @ vec_b)
    mat_a = np.array([[1, 2], [1, 2]])
    mat_b = np.array([[6, 7], [6, 7]])
    print(mat_a @ mat_b)
点击查看代码
class Regression(object):
    """Abstract marker base class shared by all regressor implementations."""

    pass
均方根误差 python
点击查看代码
def rmse(a, b):
    """Root-mean-squared error between arrays `a` and `b`."""
    return np.sqrt(np.mean((a - b) ** 2))
training_errors = []
test_errors = []
# Sweep polynomial degrees 0..9 and record train/test RMSE for each (PRML Fig. 1.5)
for i in range(10):
    feature = PolynomialFeature(i)
    X_train = feature.transform(x_train)
    X_test = feature.transform(x_test)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y = model.predict(X_test)
    training_errors.append(rmse(model.predict(X_train), y_train))
    # Fresh noise is added to the clean test targets so the test error
    # is measured against noisy observations, like the training data.
    test_errors.append(rmse(model.predict(X_test), y_test + np.random.normal(scale=0.25, size=len(y_test))))
plt.plot(training_errors, 'o-', mfc="none", mec="b", ms=10, c="b", label="Training")
plt.plot(test_errors, 'o-', mfc="none", mec="r", ms=10, c="r", label="Test")
plt.legend()
plt.xlabel("degree")
plt.ylabel("RMSE")
plt.show()
正则化python实现
点击查看代码
# Regularized (ridge) fit of a degree-9 polynomial — tames the overfitting of M=9
feature = PolynomialFeature(9)
X_train = feature.transform(x_train)
X_test = feature.transform(x_test)
model = RidgeRegression(alpha=1e-3)  # alpha plays the role of lambda in eq. (1.4)
model.fit(X_train, y_train)
y = model.predict(X_test)
plt.scatter(x_train, y_train, facecolor="none", edgecolor="b", s=50, label="training data")
plt.plot(x_test, y_test, c="g", label="$\sin(2\pi x)$")
plt.plot(x_test, y, c="r", label="fitting")
plt.ylim(-1.5, 1.5)
plt.legend()
plt.annotate("M=9", xy=(-0.15, 1))
plt.show()
1.2概率论
\(n_{ij}\)
N次取样,每次得到一组x,y。\(X=x_{i}, Y=y_{j}\)被取到的次数是\(n_{ij}\)
\(p\left(X=x_{i}, Y=y_{i}\right)=\frac{n_{i j}}{N} \tag{1.5}\) \(p\left(X=x_{i}\right)=\frac{c_{i}}{N} \tag{1.6}\)
基本公式
\(p\left(X=x_{i}\right)\)
边缘概率
\(p\left(Y=y_{j}|X=x_{i}\right)=\frac{n_{ij}}{c_i} \tag{1.8}\)
条件概率
\(p\left(X=x_{i}\right)=\sum_{j=1}^{L}p\left(X=x_{i}, Y=y_{j}\right) \tag{1.7}\)\(\color{red}{p(X)=\sum_{Y}p(X,Y) \tag{1.10}}\)\(\color{red}{p(X)=\int_{Y}p(X,Y)dy \tag{1.31}}\)
\(加法法则\)
\(p\left(X=x_{i}, Y=y_{j}\right)=\frac{n_{i j}}{N}=\frac{n_{i j}}{c_{i}} \cdot \frac{c_{i}}{N}=p\left(Y=y_{j} | X=x_{i}\right) p\left(X=x_{i}\right) \tag{1.9}\)\(\color{red}{p(X,Y)=p(Y|X)p(X)} \tag{1.11}\)
乘积法则
\(\color{red}{P(Y | X)=\frac{P(X | Y) P(Y)}{P(X)}} \tag{1.12}\)\(\color{red}{P(Y | X)=\frac{P(X | Y) P(Y)}{\sum_{Y}P\left(X | Y\right)P\left(Y\right)}} \tag{1.12}\)\(\color{red}{P(Y | X)=\frac{P(X | Y) P(Y)}{\int_{Y}P\left(X | Y\right)P\left(Y\right)dy}} \tag{1.12}\)
贝叶斯定理
\(P(X)=\sum_{Y}P\left(X | Y\right)P\left(Y\right)\\P(X)=\int_{Y}P\left(X | Y\right)P\left(Y\right)dy\)
归一化常数
\(P(Y)\)
先验概率
\(P(Y | X)\)
后验概率
\(p_{y}(y)=p_{x}(x)\left|\frac{\mathrm{d} x}{\mathrm{d} y}\right|=p_{x}(g(y))\left|g^{\prime}(y)\right| \tag{1.27}\)
Jacobian因子变化(概率形式变换),有点像复合函数,只不过这个函数是个概率密度函数,推导见其他文章
$ \mathbb{E}[f] = \sum\limits_xp(x)f(x) \tag{1.33} $
离散函数的期望
$ \mathbb{E}[f] = \int p(x)f(x)dx \tag{1.34} $
连续函数的期望
$ \mathbb{E}[f] \simeq \frac{1}{N}\sum\limits_{n=1}^{N}f(x_n) \tag{1.35} $
大数定理
$ \mathbb{E}_x[f(x, y)] \tag{1.36} $
多变量下对x变量求期望
$ \mathbb{E}[f|y] = \sum\limits_x p(x|y)f(x) \tag{1.37} $
条件期望
$ var[f] = \mathbb{E}[(f(x) - \mathbb{E}[f(x)])^2] \tag{1.38} $$ var[f] = \mathbb{E}[f(x)^2] − \mathbb{E}[f(x)]^2 \tag{1.39} $$ var[x] = \mathbb{E}[x^2] − \mathbb{E}[x]^2 \tag{1.40} $
方差,第三个概率论书上都有证明
$ \begin{eqnarray} cov[x, y] &=& \mathbb{E}_{x,y}[\{x - \mathbb{E}[x]\}\{y - \mathbb{E}[y]\}] \\ &=& \mathbb{E}_{x,y}[xy] - \mathbb{E}[x]\mathbb{E}[y] \tag{1.41} \end{eqnarray} $$ \begin{eqnarray} cov[x, y] &=& \mathbb{E}_{x,y}[\{x - \mathbb{E}[x]\}\{y^T - \mathbb{E}[y^T]\}] \\ &=& \mathbb{E}_{x,y}[xy^T] - \mathbb{E}[x]\mathbb{E}[y^T] \tag{1.42} \end{eqnarray} $
协方差,第二个是向量表示
1.23 贝叶斯概率
\(p(D|w)\)
似然函数
\(p(w|D)\)
后验
\(p(w)\)
先验
$ p(D) = \int p(D|w)P(w)dw \tag{1.45} $$ p(D) = \sum_w p(D|w)P(w) $
归一化因子
$ p(w| D) = \frac{p(D|w)p(w)}{p(D)} \tag{1.43} $
贝叶斯定理
$ \text{posterior} \propto \text{likelihood} × \text{prior} \tag{1.44} $
三者关系
频率派:w是固定的参数。最小误差函数就是最大似然估计
贝叶斯派:w不是固定的,需要用概率分布表达这种不确定性。
1.24 高斯分布
\(\mathcal{N}\left(x | \mu, \sigma^{2}\right)=\frac{1}{\left(2 \pi \sigma^{2}\right)^{\frac{1}{2}}} \exp \left\{-\frac{1}{2 \sigma^{2}}(x-\mu)^{2}\right\} \tag{1.46}\)
一元变量的高斯分布
\(\mathcal{N}(x|\mu, \Sigma) = \frac{1}{(2\pi)^{D/2}} \frac{1}{|\Sigma|^{1/2}} \exp\left\{-\frac{1}{2}(x-\mu)^T\Sigma^{-1}(x - \mu)\right\} \tag{1.52}\)
D维高斯分布
\(\Sigma\)
\(D\times D\)维协方差矩阵
$ \mathbb{E}[x] = \int_{-\infty}^{\infty} \mathcal{N}(x|\mu, \sigma^2)xdx = \mu \tag{1.49}$
高斯分布的均值-一阶矩
\(\mathbb{E}[x^2] = \int_{-\infty}^{\infty} \mathcal{N}(x|\mu, \sigma^2)x^2dx = \mu^2 + \sigma^2 \tag{1.50}\)
二阶矩
$ var[x] = \mathbb{E}[x^2] - \mathbb{E}[x]^2 = \sigma^2 \tag{1.51} $
方差
样本均值:\(\mu_{ML} = \frac{1}{N}\sum\limits_{n=1}^{N}x_n \tag{1.55}\)
样本方差:$\sigma_{ML}^2 = \frac{1}{N}\sum\limits_{n=1}^{N}(x_n - \mu_{ML})^2 \tag{1.56} $
高斯分布的最大似然函数最优解
--
\(\mathbb{E}\left[\mu_{M L}\right]=\mu \tag{1.57}\)
高斯分布的均值\(\mu\)是无偏估计
\(\mathbb{E}\left[\sigma_{M L}^{2}\right]=\left(\frac{N-1}{N}\right) \sigma^{2} \tag{1.58}\)
高斯分布的方差是有偏估计
\(\tilde\sigma^2 =\frac{N}{N-1}\sigma_{ML}^2=\frac{1}{N-1}\sum_{n=1}^{N}(x_n-\mu_{ML})^2 \tag{1.59}\)
这个方差参数估计是无偏的
\(Var(X_1+X_2+...+X_n)=\sum\limits_{k=1}^{n}Var(X_k)\)
\(若X_1,X_2,...,X_n相互独立,且都存在方差,则该式子成立\)
1.25 重新考察曲线拟合-从后验(MAP)角度看似然问题
什么是后验?就是从验证数据集 x,t角度看最大似然估计
\(N个输入\),x_train
\(N个输出\),y_train
\(1个输入,D是维度\),x_test
\(1个输出\),y_test
\(\beta\)
精度,\(\beta^{-1}=\sigma^2\)
\(通过已经拟合好的函数y,t服从高斯分布p(t|x, w, \beta),注意x是给定的情况下,t是高斯分布\)
似然函数
对式1.60进行最大似然,发现最大化该似然函数等价于最小化误差函数
最大似然解
代入最大似然解后的预测分布
\(p(\boldsymbol{w} | \alpha)=\mathcal{N}\left(\boldsymbol{w} | \mathbf{0}, \alpha^{-1} \boldsymbol{I}\right)=\left(\frac{\alpha}{2 \pi}\right)^{\frac{M+1}{2}} \exp \left\{-\frac{\alpha}{2} \boldsymbol{w}^{T} \boldsymbol{w}\right\} \tag{1.65}\)
\(w\)的先验,假设是正态的(1.69中可以证明先验是正态的)
\(\alpha\)
\(w的先验分布的精度,是个超参数\)
\(p(w|X, T, \alpha, \beta) \propto p(T|X, w, \beta)p(w|\alpha) \tag{1.66}\)
证明见本博客
$ \frac{\beta}{2}\sum\limits_{n=1}^{N}{y(x_n, w) - t_n}^2 + \frac{\alpha}{2}w^Tw \tag{1.67} $
对1.66取log得到MAP公式,就是正则化函数
1.26 纯贝叶斯方法的曲线拟合
\(p(t|x, X, T) = \int p(t|x, w)p(w|X, T)dw \tag{1.68}\)
本章尝试对w积分,而不是像上一章对w做估计(或者说是仅利用加法法则和乘法法则求后验,而不是用贝叶斯定理求后验)
1.68求解结果
\(p(t|x, X, T) = \mathcal{N}(t|m(x), s^2(x)) \tag{1.69}\)
\(m(x)=\beta \phi(x)^{T} S \sum_{n=1}^{N} \phi\left(x_{n}\right) t_{n} \tag{1.70}\)
\(s^{2}(x)=\beta^{-1}+\phi(x)^{T} \boldsymbol{S} \phi(x) \tag{1.71}\)
\(S^{-1} = \alpha I + \beta\sum\limits_{n=1}^N\phi(x)\phi(x)^T \tag{1.72}\)
其中\(I\)是单位矩阵,定义向量\(\phi(x)\)为\(\phi_i(x) = x^i, i = 0,...,M\)
总结一下几个正态分布
1.预测分布-MAP
\(p(t|x, w_{ML}, \beta_{ML}) = \mathcal{N}(t|y(x, w_{ML}), \beta_{ML}^{-1}) \tag{1.64}\)
2.先验分布-MAP
\(p(\boldsymbol{w} | \alpha)=\mathcal{N}\left(\boldsymbol{w} | \mathbf{0}, \alpha^{-1} \boldsymbol{I}\right)=\left(\frac{\alpha}{2 \pi}\right)^{\frac{M+1}{2}} \exp \left\{-\frac{\alpha}{2} \boldsymbol{w}^{T} \boldsymbol{w}\right\} \tag{1.65}\)
3.先验分布
\(p(w|X,T)\sim \mathcal{N}(w|\mu,S) \tag{1.68中的推导可得}\)
\(S^{-1}=\alpha I+\beta \phi^T\phi,\mu = \beta S\phi^Tt_n\)
4.预测分布
\(p(t|x, X, T) = \mathcal{N}(t|m(x), s^2(x)) \tag{1.69}\)
贝叶斯回归的python实现
注意,在这段python代码中,实际没有用到上面的1.68-1.72的公式,实际是共轭先验的思想,对\(w\)假设是高斯分布,则其对应的后验也是高斯,推导的结果见公式3.48-3.51,最后样本拟合后得到的是\(p(w|X,T)\)这样一个正态分布,注意是一个分布,然后针对这个分布做随机采样,就能得到y_test
\(p(w) = \mathcal{N}(w|m_0,S_0) \tag{3.48}\)
\(p(w|t) = \mathcal{N}(w|m_N,S_N) \tag{3.49}\)
其中
\(\begin{eqnarray} m_N &=& S_N(S_0^{-1}m_0 + \beta\Phi^Tt) \tag{3.50} \\ S_N^{-1} &=& S_0^{-1} + \beta\Phi^T\Phi \tag{3.51} \end{eqnarray}\)
版本一
点击查看代码
import numpy as np
from prml.linear.regression import Regression
class BayesianRegression(Regression):
    """
    Bayesian linear regression.

    Prior:      w ~ N(w | 0, alpha^(-1) I)
    Model:      y = X @ w
    Likelihood: t ~ N(t | X @ w, beta^(-1))
    """

    def __init__(self, alpha: float = 1., beta: float = 1.):
        # alpha: precision of the isotropic Gaussian prior over the weights
        # beta: precision (inverse variance) of the observation noise
        self.alpha = alpha
        self.beta = beta
        self.w_mean = None       # posterior mean of w; None until fit() runs
        self.w_precision = None  # posterior precision matrix of w; None until fit() runs

    def _is_prior_defined(self) -> bool:
        # True once fit() has run at least once; the posterior then serves
        # as the prior for the next fit() call (sequential updating).
        return self.w_mean is not None and self.w_precision is not None

    def _get_prior(self, ndim: int) -> tuple:
        '''
        Return the current prior over the weights.

        :param ndim: dimensionality of the weight vector
        :return: (w_mean, w_precision) — mean vector and precision matrix of
                 the prior; N(0, alpha^(-1) I) before the first fit.
        '''
        if self._is_prior_defined():
            return self.w_mean, self.w_precision
        else:
            return np.zeros(ndim), self.alpha * np.eye(ndim)

    def fit(self, X: np.ndarray, t: np.ndarray):
        """
        Update the Bayesian posterior over the weights from training data.

        Parameters
        ----------
        X : (N, n_features) np.ndarray
            Training inputs.
        t : (N,) np.ndarray
            Training targets.
        """
        # Prior mean m_0 and precision S_0^{-1} (= alpha*I initially), PRML eq. 3.48
        mean_prev, precision_prev = self._get_prior(
            np.size(X, 1))
        # Posterior precision S_N^{-1} = S_0^{-1} + beta X^T X, PRML eq. 3.51/3.54
        w_precision = precision_prev + self.beta * X.T @ X
        # Posterior mean m_N: solve S_N^{-1} m_N = S_0^{-1} m_0 + beta X^T t
        # (PRML eq. 3.50/3.53) — solving the linear system avoids an explicit inverse
        w_mean = np.linalg.solve(
            w_precision,
            precision_prev @ mean_prev + self.beta * X.T @ t
        )
        self.w_mean = w_mean
        self.w_precision = w_precision
        self.w_cov = np.linalg.inv(self.w_precision)  # S_N, posterior covariance

    def predict(self, X: np.ndarray, return_std: bool = False, sample_size: int = None):
        """
        Return the mean (and optionally std) of the predictive distribution.

        Parameters
        ----------
        X : (N, n_features) np.ndarray
            Independent variable (inputs to predict for).
        return_std : bool, optional
            Flag to return the standard deviation (default is False).
        sample_size : int, optional
            Number of weight samples to draw from the posterior
            (default None: no sampling).
            NOTE(review): annotation should be Optional[int] — None is the default.

        Returns
        -------
        y : (N,) np.ndarray
            Mean of the predictive distribution.
        y_std : (N,) np.ndarray
            Standard deviation of the predictive distribution.
        y_sample : (N, sample_size) np.ndarray
            Samples from the predictive distribution (only when sample_size given).
        """
        # When sampling is requested, draw weight vectors from the fitted
        # Gaussian posterior and return the corresponding curves.
        if sample_size is not None:
            w_sample = np.random.multivariate_normal(
                self.w_mean, self.w_cov, size=sample_size
            )
            y_sample = X @ w_sample.T
            return y_sample
        y = X @ self.w_mean
        if return_std:
            # Predictive variance, PRML eq. 1.71 / 3.59: noise term + weight-uncertainty term
            y_var = 1 / self.beta + np.sum(X @ self.w_cov * X, axis=1)
            y_std = np.sqrt(y_var)
            return y, y_std
        return y
点击查看代码
# Bayesian curve fitting: plot the posterior predictive mean with a +/- 1 std band
model = BayesianRegression(alpha=2e-3, beta=2)
model.fit(X_train, y_train)
y, y_err = model.predict(X_test, return_std=True)
plt.scatter(x_train, y_train, facecolor="none", edgecolor="b", s=50, label="training data")
plt.plot(x_test, y_test, c="g", label="$\sin(2\pi x)$")
plt.plot(x_test, y, c="r", label="mean")
plt.fill_between(x_test, y - y_err, y + y_err, color="pink", label="std.", alpha=0.5)
plt.xlim(-0.1, 1.1)
plt.ylim(-1.5, 1.5)
plt.annotate("M=9", xy=(0.8, 1))
plt.legend(bbox_to_anchor=(1.05, 1.), loc=2, borderaxespad=0.)
plt.show()
版本二
点击查看代码
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
# %matplotlib inline
# 定义数据集:
def uniform(size):
    """Return `size` evenly spaced points on [0, 1] as a (size, 1) column vector."""
    grid = np.linspace(0, 1, size)
    return grid.reshape(-1, 1)
def create_data(size):
    """Return (x, y): noisy samples of sin(2*pi*x) at `size` evenly spaced points.

    The RNG is re-seeded with a fixed value, so every call draws identical noise.
    """
    inputs = uniform(size)
    np.random.seed(42)
    noise = np.random.normal(scale=0.15, size=inputs.shape)
    return inputs, sin_fun(inputs) + noise
def sin_fun(x):
    """Ground-truth target function: sin(2*pi*x)."""
    return np.sin((2 * np.pi) * x)
# Training set of 10 noisy points; dense, noise-free grid for evaluation
x_train, y_train = create_data(10)
x_test = uniform(100)
y_test = sin_fun(x_test)
# Degree-9 polynomial design matrices built with scikit-learn
poly = PolynomialFeatures(9)
X_train = poly.fit_transform(x_train)
X_test = poly.fit_transform(x_test)
#### 定义贝叶斯估计方法:
class BayesianRegressor():
    """
    Bayesian linear regression with a zero-mean isotropic Gaussian prior.

    Prior precision is `alpha`, observation-noise precision is `beta`;
    fitting computes the weight posterior N(m_N, S_N) of PRML eqs.
    (3.50)-(3.51) with m_0 = 0.
    """

    def __init__(self, alpha=1., beta=1.):
        self.alpha = alpha      # prior precision on the weights
        self.beta = beta        # observation-noise precision
        self.mean_prev = None   # posterior mean m_N, set by fit()
        self.S = None           # posterior covariance S_N, set by fit()

    def fit(self, X, t):
        """Compute the weight posterior from design matrix X and targets t."""
        n_features = np.size(X, 1)
        # S_N^{-1} = alpha I + beta X^T X
        S_inv = self.alpha * np.eye(n_features) + self.beta * np.matmul(X.T, X)
        # m_N solves S_N^{-1} m_N = beta X^T t  (zero prior mean)
        rhs = self.beta * np.matmul(X.T, t)
        self.mean_prev = np.linalg.solve(S_inv, rhs)
        self.S = np.linalg.inv(S_inv)

    def predict(self, X):
        """Return predictive mean and standard deviation for each row of X."""
        mean = np.matmul(X, self.mean_prev)
        # predictive variance = noise variance + weight-uncertainty term
        variance = 1 / self.beta + np.sum(np.matmul(X, self.S) * X, axis=1)
        return mean, np.sqrt(variance)
# Finally, fit the model to the observed data, predict on new inputs and plot the curve
model = BayesianRegressor(alpha=2e-3, beta=2)
y_train = y_train.reshape(10)  # flatten the (10, 1) targets to (10,)
model.fit(X_train, y_train)
y, y_std = model.predict(X_test)
fig = plt.figure(figsize=(12, 8))
plt.scatter(x_train, y_train, facecolor="none", edgecolor="b", s=50, label="training data")
plt.plot(x_test, y_test, c="g", label="$\sin(2\pi x)$")
plt.plot(x_test, y, c="r", label="Mean")
plt.fill_between(x_test[:, 0], y - y_std, y + y_std, color="pink", label="std", alpha=0.5)
plt.title("M=9")
plt.legend(loc=2)
plt.show()
1.5.4 推断和决策
三种方法
1.生成模型:通过对每个类别\(C_k\),独立的确定类别的条件密度\(p(x | C_k)\)来解决推断问题,还分别推断出类别的先验概率\(p(C_k)\),然后使用贝叶斯定理:
\(p(C_k|x) = \frac{p(x|C_k)p(C_k)}{p(x)} \tag{1.82}\)
来计算类别的后验概率\(p(C_k|x)\)。
2.判别模型,解决确定类别的后验密度\(p(C_k|x)\)的推断问题,然后,使用决策论来对新的输入\(x\)进行分类。
3.判别函数:找到能直接把输入\(x\)映射到类别标签\(f(x)\)。
1.6 信息论
离散熵:如果箱子是X的离散状态\(x_i\),X的熵就是:
当所有的\(p(x_i)\)都相等,且值为\(p(x_i)=\frac{1}{M}\)时,熵取得最大值
微分熵
最大化微分熵的分布是高斯分布
高斯分布的微分熵
条件熵
条件熵的乘积规则
KL散度(相对熵),KL散度不是对称的,\(\mathrm{KL}(p \| q) \ne \mathrm{KL}(q \| p)\)
互信息
利用概率加和和乘积规则