吴恩达机器学习课后作业01——ex1(单变量线性回归和多变量线性回归)
Programming Exercise 1: Linear Regression
大致说明:
假设你是一家连锁餐厅的首席执行官,正在考虑在不同的城市开设一家新的分店。该连锁店已经在多个城市拥有餐车,你手上有这些城市的人口和利润数据,需要据此预测人口和利润之间的关系。
根据ex1data1.txt(第一列是城市人口,第二列是对应的利润,其中负值代表着亏损)、ex1data2.txt(多变量使用到的数据)中的数据,进行线性拟合
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
2 Linear regression with one variable
# Read the two comma-separated columns of the data file; with unpack=True
# each column comes back as one row of `cols`.
datafile = 'data/ex1data1.txt'
cols = np.loadtxt(datafile, delimiter=',', usecols=(0, 1), unpack=True)
# Build the usual design matrix "X" and target vector "y" as column-oriented
# 2-D arrays: every row of `cols` except the last goes to X, the last to y.
X = np.array(cols[:-1]).T
y = np.array(cols[-1:]).T
m = y.shape[0]  # number of training examples
2.1 Plotting the Data
# Scatter the raw data: first column of X against y, red circles of size 5.
scatter_axes = plt.gca()
scatter_axes.plot(X[:, 0], y[:, 0], 'ro', markersize=5)
scatter_axes.set_ylabel('Profit in $10,000s')
scatter_axes.set_xlabel('Population of City in 10,000s')
scatter_axes.set_title('POPULATION AND PROFIT')
2.2 Gradient Descent
iterations = 1500  # number of gradient-descent steps
alpha = 0.01       # learning rate
# Prepend a constant column of ones to X so that theta_0 acts as the intercept.
X = np.hstack((np.ones((X.shape[0], 1)), X))
线性回归算法优化的目标是:选取最有可能与数据相拟合的直线。数据与直线的误差,称为建模误差 modeling error。为了使建模误差最小,我们需要调整参数θ0 和 θ1,使得代价函数Cost function J(θ0, θ1)的值最小。
在各种代价函数中,最常用的是平方误差代价函数 Squared error cost function。
"""
theta是一个n维列向量
X是一个m行n列的矩阵(单变量情况下,是两列(包含一个变量和theta_0))
y是一个m行1列的矩阵(m维的列向量)
"""
def h(X, theta):
    """Evaluate the linear hypothesis: the dot product of X and theta.

    X: design matrix (or a single feature row); theta: parameter vector.
    Returns whatever ``np.dot(X, theta)`` produces for those operands.
    """
    predictions = np.dot(X, theta)
    return predictions
def computeCost(X, mytheta, y):
    """Return the squared-error cost J(theta) = sum((X.theta - y)^2) / (2 * n).

    X: design matrix with one row per training example.
    mytheta: parameter vector (column vector or 1-D).
    y: target values (column vector or 1-D), one per row of X.

    The sample count n is taken from the data itself rather than from a
    module-level variable, so the function gives the right scaling for any
    data set it is called with. The result is a plain Python float.

    Raises ValueError when there are no training examples.
    """
    # Same computation as the hypothesis h(X, theta), evaluated only once.
    predictions = np.dot(X, mytheta)
    # Flatten both sides so an (n, 1) column and an (n,) vector never
    # broadcast into an (n, n) matrix by accident.
    residual = np.ravel(predictions) - np.ravel(y)
    if residual.size == 0:
        raise ValueError("computeCost requires at least one training example")
    # The dot product of two 1-D arrays is a scalar, so float() is applied to
    # a scalar and not to a 1x1 array (deprecated since NumPy 1.25).
    return float(np.dot(residual, residual)) / (2 * residual.size)
# Sanity check: start from an all-zero column vector theta with one entry per
# column of X and print the resulting cost (the original author's note
# expects about 32.07 for this data set).
initial_theta = np.zeros(shape=(X.shape[1], 1))
print(computeCost(X=X, mytheta=initial_theta, y=y))
32.07273387745567
#定义梯度下降函数:随着梯度下降的每一步,参数theta_j都会接近最优值(最小值),从而达到损失最低的J(θ)
def descendGradient(X, theta_start, targets=None, learning_rate=None,
                    num_iterations=None):
    """Run batch gradient descent for linear regression.

    X: design matrix, one row per training example (bias column included).
    theta_start: initial parameters; it is copied and never modified.
    targets: target values; defaults to the module-level ``y``.
    learning_rate: step size; defaults to the module-level ``alpha``.
    num_iterations: number of steps; defaults to the module-level
        ``iterations``.

    Returns ``(theta, thetahistory, costVector)``: the final parameters as a
    column vector, the parameter values recorded before each step (a list of
    lists), and the cost recorded before each step.

    All parameters are updated simultaneously from one shared residual.
    The previous version aliased its working copy to ``theta``, so each
    parameter was updated using already-updated earlier parameters, and the
    caller's ``theta_start`` array was overwritten in place.
    """
    # Fall back to the notebook-level settings when not given explicitly.
    targets = y if targets is None else targets
    learning_rate = alpha if learning_rate is None else learning_rate
    num_iterations = iterations if num_iterations is None else num_iterations

    X = np.asarray(X, dtype=float)
    targets = np.asarray(targets, dtype=float).reshape(-1, 1)
    n_samples = X.shape[0]
    # np.array copies, which keeps the caller's initial vector untouched.
    theta = np.array(theta_start, dtype=float).reshape(-1, 1)

    costVector = []    # cost before each update
    thetahistory = []  # parameters before each update
    for _ in range(num_iterations):
        # One residual per iteration feeds both the cost and the gradient.
        residual = np.dot(X, theta) - targets
        flat = residual[:, 0]
        costVector.append(float(np.dot(flat, flat)) / (2 * n_samples))
        thetahistory.append(theta[:, 0].tolist())
        # Vectorised simultaneous update; rebinding (not in-place assignment)
        # leaves previously recorded state and the input array alone.
        theta = theta - (learning_rate / n_samples) * np.dot(X.T, residual)
    return theta, thetahistory, costVector
# Run gradient descent from an all-zero start; theta is a column vector with
# one entry per column of X.
initial_theta = np.zeros(shape=(X.shape[1], 1))
theta, thetahistory, costVector = descendGradient(X=X, theta_start=initial_theta)
#定义绘制代价变化的函数曲线
def plotCostByStep(costVector):
    """Plot the recorded cost values against their step index and show it.

    costVector: sequence of cost values, one per recorded step.
    The x-range is derived from the module-level ``iterations`` and the
    y-range is fixed to [4, 7].
    """
    axes = plt.gca()
    steps = range(len(costVector))
    axes.plot(steps, costVector, 'b-')
    axes.set_title("Cost of each step")
    axes.set_xlabel("Iteration number")
    axes.set_ylabel("Cost Value")
    # Leave a 5% margin on both sides of the iteration range.
    axes.set_xlim([-0.05*iterations, 1.05*iterations])
    axes.set_ylim([4, 7])
    plt.show()
#定义参数变化的函数曲线
def plotThetaByStep(thetahistory):
    """Plot the first two recorded parameters against their step index.

    thetahistory: sequence of per-step parameter lists; column 0 is drawn
    in green and column 1 in red.
    """
    history = np.array(thetahistory)
    steps = range(history.shape[0])
    # (column index, line style, legend label) for each curve.
    curves = (
        (0, 'g-', "Theta_0"),
        (1, 'r-', "Theta_population"),
    )
    for column, line_style, legend_label in curves:
        plt.plot(steps, history[:, column], line_style, label=legend_label)
    plt.legend()
    plt.title("Thetas of each step")
    plt.xlabel("Iteration number")
    plt.ylabel("Theta Value")
# Visualise how the cost and the parameters evolved over the recorded steps.
plotCostByStep(costVector=costVector)
plotThetaByStep(thetahistory=thetahistory)
#返回预测函数的预测值(假设函数)
def myfit(xval):
    """Return the fitted line's prediction for ``xval``.

    Uses the first two entries of the module-level ``theta`` as intercept
    and slope.
    """
    intercept = theta[0]
    slope = theta[1]
    return intercept + slope * xval
# Draw the training points and the fitted hypothesis line on the same axes.
plt.plot(X[:,1],y[:,0],'ro',markersize=5,label='Training Data')
# theta is a column vector, so index it down to scalars before %-formatting:
# formatting a shape-(1,) array with %f relies on an implicit array-to-scalar
# conversion that NumPy has deprecated since 1.25.
plt.plot(X[:,1],myfit(X[:,1]),'b-',label = 'Hypothesis: h(x) = %0.2f + %0.2fx'%(theta[0,0],theta[1,0]))
plt.grid(True)  # show grid lines
plt.ylabel('Profit in $10,000s')
plt.xlabel('Population of City in 10,000s')
plt.legend()  # show the legend
2.4 Visualizing J($\theta$)
#导入matplotlib的三维必要包
from mpl_toolkits.mplot3d import axes3d, Axes3D
import itertools
# Evaluate the cost on a grid of (theta_0, theta_1) pairs and draw it as a
# 3-D scatter, then overlay the path recorded during gradient descent.
plt.figure(figsize=(10,10))
ax = plt.axes(projection='3d')

xvals = np.arange(-5,5,.3)
yvals = np.arange(-1,4,.1)

# itertools.product walks xvals in the outer position and yvals in the inner
# one, i.e. the same order as two nested for-loops.
grid = list(itertools.product(xvals, yvals))
myxs = [t0 for t0, _ in grid]
myys = [t1 for _, t1 in grid]
myzs = [computeCost(X, np.array([[t0], [t1]]), y) for t0, t1 in grid]

scat = ax.scatter(myxs,myys,myzs,c=np.abs(myzs),cmap=plt.get_cmap('YlOrRd'))

ax.set_xlabel(r'$\theta_0$',fontsize=20)
ax.set_ylabel(r'$\theta_1$',fontsize=20)
ax.set_title('Cost (Minimization Path Shown in Blue)',fontsize=30)

path_theta0 = [step[0] for step in thetahistory]
path_theta1 = [step[1] for step in thetahistory]
ax.plot(path_theta0, path_theta1, costVector, 'b-')
3. Linear Regression with multiple variables
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# Housing training set; per the original author's note the columns are house
# size, number of bedrooms and price.
datafile = 'data/ex1data2.txt'
# Read the three comma-separated columns; unpack=True returns one row per column.
cols = np.loadtxt(datafile, delimiter=',', usecols=(0, 1, 2), unpack=True)
# All columns but the last form the feature matrix, the last one the targets.
X = np.array(cols[:-1]).T
y = np.array(cols[-1:]).T
m = y.shape[0]  # training-set size
# Prepend the constant column of ones that multiplies theta_0.
X = np.hstack((np.ones((X.shape[0], 1)), X))
#可视化数据,判断是否需要特征缩放(feature normalization)
def plotFeatureCounts(x,label):
    """Show a 20-bin histogram of ``x`` titled with ``label``."""
    axes = plt.gca()
    axes.hist(x, bins=20)
    axes.set_title(label)
    axes.set_xlabel('Column Value')
    axes.set_ylabel('Counts')
    plt.show()
# One histogram per column of X, in column order.
feature_titles = ("Theta_0", "The size of the house", "The number of bedrooms")
for column_index, feature_title in enumerate(feature_titles):
    plotFeatureCounts(X[:, column_index], feature_title)
# Feature scaling with z-score normalisation. The mean and standard deviation
# of every column (bias column included) are stored so the same scaling can be
# applied to new inputs later.
Xcopy = X.copy()
stored_feature_means = []
stored_feature_stds = []
for icol in range(Xcopy.shape[1]):
    column_mean = np.mean(Xcopy[:, icol])
    column_std = np.std(Xcopy[:, icol])
    stored_feature_means.append(column_mean)
    stored_feature_stds.append(column_std)
    # Column 0 is the constant bias column and is left unscaled.
    if icol > 0:
        Xcopy[:, icol] = (Xcopy[:, icol] - column_mean) / column_std
# Overlay the histograms of the three columns after feature scaling.
hist_axes = plt.gca()
hist_axes.set_xlim([-5,5])
for column_index, legend_label in enumerate(('Theta_0', 'Theta_size', 'Theta_bedrooms')):
    hist_axes.hist(Xcopy[:, column_index], label=legend_label)
hist_axes.set_title('Feature Normalization Accomplished')
hist_axes.set_xlabel('Column Value')
hist_axes.set_ylabel('Counts')
hist_axes.legend()
# Multivariate gradient descent on the scaled features, starting from an
# all-zero column vector with one entry per column of Xcopy.
initial_theta = np.zeros(shape=(Xcopy.shape[1], 1))
theta, thetahistory, costVector = descendGradient(X=Xcopy, theta_start=initial_theta)
def plotThetaMultiByStep(thetahistory):
    """Plot the first three recorded parameters against their step index.

    thetahistory: sequence of per-step parameter lists; columns 0, 1 and 2
    are drawn in green, red and blue respectively.
    """
    history = np.array(thetahistory)
    steps = range(history.shape[0])
    # (column index, line style, legend label) for each curve.
    curves = (
        (0, 'g-', "Theta_0"),
        (1, 'r-', "Theta_size"),
        (2, 'b-', "Theta_bedrooms"),
    )
    for column, line_style, legend_label in curves:
        plt.plot(steps, history[:, column], line_style, label=legend_label)
    plt.legend()
    plt.title("Thetas of each step")
    plt.xlabel("Iteration number")
    plt.ylabel("Theta Value")
def plotCostMultiByStep(costVector):
    """Plot the recorded cost values against their step index and show it.

    costVector: sequence of cost values, one per recorded step. Axis limits
    are left to matplotlib's defaults.
    """
    axes = plt.gca()
    steps = range(len(costVector))
    axes.plot(steps, costVector, 'b-')
    axes.set_title("Cost of each step")
    axes.set_xlabel("Iteration number")
    axes.set_ylabel("Cost Value")
    plt.show()
# Print the final parameters, then plot the cost and parameter histories.
print(theta)
plotCostMultiByStep(costVector=costVector)
plotThetaMultiByStep(thetahistory=thetahistory)
[[340412.56301439]
[109371.67272252]
[ -6502.3992545 ]]
# Check the fit on one example input.
print("Check of result: What is price of house with 1650 square feet and 3 bedrooms?")
ytest = np.array([1650.,3.])
# Scale the raw inputs with the statistics stored during normalisation.
# Entry 0 of the stored lists belongs to the bias column, hence the [1:].
ytestscaled = [
    (value - mean) / std
    for value, mean, std in zip(ytest, stored_feature_means[1:], stored_feature_stds[1:])
]
ytestscaled.insert(0,1)  # prepend the constant 1 that multiplies theta_0
# h() returns a one-element array here; .item() extracts the scalar explicitly
# because float() on an array with ndim > 0 is deprecated since NumPy 1.25.
print("$%0.2f" % h(ytestscaled,theta).item())
Check of result: What is price of house with 1650 square feet and 3 bedrooms?
$293098.15
#正规方程法
from numpy.linalg import inv
#正规方程求解参数
def normEqtn(X,y):
    """Return the least-squares parameters theta minimising ||X.theta - y||.

    X: design matrix, one row per training example.
    y: target values (column vector or 1-D).

    This is the solution the normal equation (X^T X)^-1 X^T y describes, but
    it is obtained with a least-squares solver instead of an explicit matrix
    inverse. That is numerically more reliable on unscaled features and still
    returns a (minimum-norm) answer when X^T X is singular, where ``inv``
    would fail.
    """
    design = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)
    # rcond=None selects NumPy's current default singular-value cutoff.
    solution = np.linalg.lstsq(design, targets, rcond=None)[0]
    return solution
print("Normal equation prediction for price of house with 1650 square feet and 3 bedrooms")
# The normal-equation fit uses the unscaled X, so the raw inputs are passed
# with a leading 1 for the intercept. h() returns a one-element array here;
# .item() extracts the scalar explicitly because float() on an array with
# ndim > 0 is deprecated since NumPy 1.25.
print("$%0.2f" % h([1,1650.,3],normEqtn(X,y)).item())
Normal equation prediction for price of house with 1650 square feet and 3 bedrooms
$293081.46