【Big Data Skills Improvement 3】Linear Regression Notes
Notation:
m: number of training samples
n: number of features
\(x^{(i)}\): the i-th training sample
\(x^{(i)}_{j}\): the j-th feature of the i-th sample
Hypothesis: \(h_\theta(x)=\theta^Tx=\theta_0+\theta_1x_1+\theta_2x_2+\cdots+\theta_nx_n\) (with \(x_0=1\))
Parameters: \(\theta_0,\theta_1,\theta_2,\cdots,\theta_n\)
Cost function: \(J(\theta_0,\theta_1,\cdots,\theta_n)=\frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})^2\)
Gradient descent:
Repeat until convergence {
\(\theta_j:=\theta_j-\alpha\frac{\partial }{\partial \theta_j}J(\theta_0,\theta_1,\theta_2,\cdots,\theta_n)\)
}
updating all \(j=0,\cdots,n\) simultaneously
When n = 1:
Repeat until convergence {
\(\theta_0:=\theta_0-\alpha\frac{1}{m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})\)
\(\theta_1:=\theta_1-\alpha\frac{1}{m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})\cdot x^{(i)}\)
}
For general \(n\ge1\):
Repeat until convergence {
\(\theta_j:=\theta_j-\alpha\frac{1}{m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})\cdot x^{(i)}_j\)
}
(taking \(x^{(i)}_0=1\) recovers the n = 1 updates above)
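This per-parameter update can also be written as one vectorized numpy expression. A minimal sketch (the function name gradient_step and the shape comments are illustrative, not from the original notes):

import numpy as np

def gradient_step(theta, X, y, alpha):
    # theta: (n+1, 1); X: (m, n+1) with a leading column of ones; y: (m, 1)
    m = len(X)
    grad = np.dot(X.T, np.dot(X, theta) - y) / m  # (1/m) * X^T (X*theta - y)
    return theta - alpha * grad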
Feature scaling
Purpose: speeds up gradient descent by reducing the number of iterations needed
Scale all features to roughly \(-1\le x_i \le 1\)
Mean normalization
Replace \(x_i\) with \(x_i-\mu_i\) so that each feature has an approximately zero mean (except \(x_0\))
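A minimal sketch of both steps; dividing by the per-feature standard deviation is one common way to reach the roughly \([-1,1]\) range (dividing by the range max - min is another) and is an assumption here, not part of the notes above. Apply it to the raw features only, before prepending the \(x_0=1\) column:

import numpy as np

def scale_features(X):
    mu = X.mean(axis=0)     # per-feature mean
    sigma = X.std(axis=0)   # per-feature standard deviation
    return (X - mu) / sigma, mu, sigma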
Learning rate
\(\theta_j:=\theta_j-\alpha\frac{\partial }{\partial \theta_j}J(\theta)\)
How do we confirm gradient descent is working correctly?
When \(J(\theta)\) barely changes as the iterations accumulate, it can be treated as optimal (i.e. \(\min J(\theta)\))
How do we choose the learning rate \(\alpha\)?
If \(\alpha\) is sufficiently small, \(J(\theta)\) should decrease on every iteration; but if it is too small, convergence is very slow, while if it is too large, the updates overshoot the minimum and \(J(\theta)\) swings ever more widely as the iterations accumulate
To choose \(\alpha\), try a ladder of values stepping by roughly 3x, e.g.: \(\cdots,0.001,0.003,0.01,0.03,0.1,0.3,1,\cdots\)
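A quick sweep over that ladder makes the trade-off visible. A minimal sketch on synthetic data (y = 4 + 3x plus noise is made up for illustration, not from the notes):

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(50, 1)
y = 4 + 3 * x + 0.1 * rng.randn(50, 1)
X = np.concatenate((np.ones((50, 1)), x), axis=1)  # prepend the x_0 = 1 column

for alpha in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]:
    theta = np.zeros((2, 1))
    for _ in range(100):  # a few iterations are enough to compare trends
        theta -= alpha / len(X) * np.dot(X.T, np.dot(X, theta) - y)
    residual = np.dot(X, theta) - y
    # Cost after 100 iterations; if it grows instead of shrinking, alpha is too large
    print(alpha, (residual ** 2).sum() / (2 * len(X)))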
Composite features
\(h_\theta(x)=\theta_0+\theta_1 \times width + \theta_2 \times height\) tends to work less well than simply using the area \(width \times height\) as a single feature
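A tiny illustration with made-up width/height values (hypothetical data, not from the notes):

import numpy as np

width = np.array([[3.0], [4.0], [5.0]])
height = np.array([[2.0], [3.0], [4.0]])
area = width * height  # one composite feature replacing the two raw ones
X = np.concatenate((np.ones((3, 1)), area), axis=1)  # design matrix with the bias column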
Normal equation
Gradient descent: solves for the parameters \(\theta\) iteratively
Normal equation: solves for \(\theta\) analytically (by setting the derivatives to zero)
The normal equation: \(\theta=(X^TX)^{-1}X^Ty\)
Gradient descent VS normal equation
Gradient descent: must choose a learning rate and iterate, but works well even when n is very large
Normal equation: no learning rate or iteration needed, but it must invert a matrix, which is slow when n is large (inversion costs roughly \(O(n^3)\))
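A minimal numpy sketch of the normal equation; using the pseudo-inverse (pinv) instead of a plain inverse is an assumption here, so the formula still yields a solution when \(X^TX\) is singular:

import numpy as np

def normal_equation(X, y):
    # theta = (X^T X)^{-1} X^T y
    return np.dot(np.linalg.pinv(np.dot(X.T, X)), np.dot(X.T, y))

The script below loads a small dataset and fits it with batch gradient descent.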
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Set the default font (SimHei) so CJK characters render correctly
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign '-' from rendering as an empty box
matplotlib.rcParams['axes.unicode_minus'] = False
def loadDataSet(filename):
    X = []
    E = []  # targets; each column of the file after the first is one Y series
    with open(filename, 'rb') as f:
        for idx, line in enumerate(f):
            line = line.decode('utf-8').strip()  # strip surrounding whitespace/newlines
            if idx == 0:
                title = line  # the first line is the header
                continue
            if not line:
                continue
            eles = [float(e) for e in line.split()]  # split the row and convert to float
            X.append(eles[:1])  # first column: the feature
            E.append(eles[1:])  # remaining columns: the targets
    return np.array(X), np.array(E), title
def h(theta, X):
    # Hypothesis h_theta(x) = X * theta (X already carries the x_0 = 1 column)
    return np.dot(X, theta)

def J(theta, X, Y):
    # Cost J(theta) = (1/2m) * sum((h - y)^2)
    m = len(X)
    residual = h(theta, X) - Y
    return np.sum(np.dot(residual.T, residual)) / (2 * m)
def bgd(alpha, maxloop, epsilon, X, Y):
    """Batch gradient descent; theta is hard-coded for one feature plus bias."""
    m, n = X.shape  # m samples, n columns (bias + feature)
    theta = np.zeros((2, 1))
    count = 0  # iteration counter
    converged = False  # convergence flag
    error = np.inf  # current cost
    errors = []
    thetas = {0: [theta[0, 0]], 1: [theta[1, 0]]}
    while count < maxloop:
        if converged:
            break
        count += 1
        temp1 = theta[0, 0] - alpha / m * (h(theta, X) - Y).sum()
        # [:, np.newaxis] reshapes the 1-D column slice into an (m, 1) column vector
        temp2 = theta[1, 0] - alpha / m * np.dot(X[:, 1][:, np.newaxis].T, h(theta, X) - Y).sum()
        # simultaneous update
        theta[0, 0] = temp1
        theta[1, 0] = temp2
        thetas[0].append(temp1)
        thetas[1].append(temp2)
        error = J(theta, X, Y)
        errors.append(error)
        if error < epsilon:
            converged = True
    return theta, errors, thetas
X, E, title = loadDataSet('test_1')
print(X.shape, X)  # .shape gives the matrix dimensions, e.g. (m, n)
print(E)
m, n = X.shape
X = np.concatenate((np.ones((m, 1)), X), axis=1)  # prepend the x_0 = 1 bias column (axis=1 joins column-wise)
X.shape
for i in range(E.shape[1]):  # one regression per target column
    alpha = 0.001  # learning rate
    maxloop = 20000  # maximum number of iterations
    epsilon = 0.1  # convergence threshold on the cost
    E1 = E[:, i:i+1]
    print(E1, X)
    theta, errors, thetas = bgd(alpha, maxloop, epsilon, X, E1)
    xCopy = X.copy()
    xCopy.sort(0)  # sort rows so the fitted line plots left to right
    yHat = h(theta, xCopy)
    # Plot the fitted regression line
    plt.xlabel(u'Income')
    plt.ylabel(u'Month')
    plt.plot(xCopy[:, 1], yHat, color='r')
    plt.scatter(X[:, 1].flatten(), E1.T.flatten())
    plt.show()
# Replot the final fit
xCopy = X.copy()
xCopy.sort(0)
yHat = h(theta, xCopy)
xCopy[:, 1].shape, yHat.shape, theta.shape
# Plot the fitted regression line
plt.xlabel(u'House area')
plt.ylabel(u'House price')
plt.plot(xCopy[:, 1], yHat, color='r')
plt.scatter(X[:, 1].flatten(), E1.T.flatten())
plt.show()
# Plot the cost curve over iterations
plt.xlim(1, 1600)
plt.ylim(3, 100)
plt.xlabel(u'Number of iterations')
plt.ylabel(u'Cost J')
plt.plot(range(len(errors)), errors)
plt.show()
# Prepare grid data for plotting the gradient-descent path
%matplotlib
from mpl_toolkits.mplot3d import axes3d
size = 100
theta0Vals = np.linspace(-10, 10, size)
theta1Vals = np.linspace(-2, 4, size)
JVals = np.zeros((size, size))
for i in range(size):
    for j in range(size):
        col = np.matrix([[theta0Vals[i]], [theta1Vals[j]]])
        JVals[i, j] = J(col, X, E1)
theta0Vals, theta1Vals = np.meshgrid(theta0Vals, theta1Vals)
JVals = JVals.T  # transpose so the grid orientation matches meshgrid
# Plot the cost function surface in 3D
contourSurf = plt.figure()
ax = contourSurf.gca(projection='3d')
ax.plot_surface(theta0Vals, theta1Vals, JVals, rstride=2, cstride=2, alpha=0.3,
                cmap=matplotlib.cm.rainbow, linewidth=0, antialiased=False)
ax.plot(theta[0], theta[1], 'rx')  # mark the solution found by gradient descent
ax.set_xlabel(r'$\theta_0$')
ax.set_ylabel(r'$\theta_1$')
ax.set_zlabel(r'$J(\theta)$')
# Plot the cost function contours
%matplotlib inline
plt.figure(figsize=(12, 6))
CS = plt.contour(theta0Vals, theta1Vals, JVals, np.logspace(-5, 6, 30))
plt.clabel(CS, inline=1, fontsize=10)
# Mark the optimum
plt.plot(theta[0, 0], theta[1, 0], 'rx', markersize=10, linewidth=3)
# Trace the gradient-descent path
plt.plot(thetas[0], thetas[1], 'rx', markersize=3, linewidth=1)
plt.plot(thetas[0], thetas[1], 'r-', markersize=3, linewidth=1)
plt.xlabel(r'$\theta_0$')
plt.ylabel(r'$\theta_1$')
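As a final sanity check (a sketch, assuming X and E1 from the run above are still in scope), the iterative solution can be compared against the closed-form normal equation:

# Compare gradient descent against the closed-form solution
thetaClosed = np.dot(np.linalg.pinv(np.dot(X.T, X)), np.dot(X.T, E1))
print(theta, thetaClosed)  # the two should agree to within the convergence tolerance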
study just for life!