复制代码
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
# Load the CCPP table, standardize the four feature columns (zero mean, unit
# standard deviation per column) and prepend a bias column of ones.
# skiprows=1 drops the first line of the CSV file.
data = np.matrix(np.loadtxt('ccpp.csv', delimiter=',', skiprows=1))
X = data[:, 0:4]    # first four columns: input features
y = data[:, 4]      # fifth column: regression target (kept as an np.matrix column)

X_norm = np.array(X)            # plain ndarray copy of the features
mu = np.mean(X_norm, 0)         # per-column mean (axis 0 = down the rows)
sigma = np.std(X_norm, 0)       # per-column standard deviation
# A constant column has sigma == 0; divide by 1 there so the column becomes
# all zeros instead of NaN. `sigma` itself keeps the true values.
safe_sigma = np.where(sigma == 0, 1.0, sigma)
X_norm = (X_norm - mu) / safe_sigma     # broadcasts over every column at once

X = X_norm
m, n = X.shape
X = np.hstack((np.ones((m, 1)), X))     # leading column of ones = intercept term
X.shape     # notebook-style inspection; has no effect when run as a script

# Split the dataset into a training set and a test set.
# The X and y samples are partitioned together into two parts: one part is the
# training set and the other is the test set.

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only for installations that predate that move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the linear model by online gradient descent: one squared-error gradient
# step per training sample, recording the test-set MSE after every step.

LEARNING_RATE = 0.01
# NOTE(review): the 6062 offset is hard-coded; presumably it was tuned to the
# size of the CCPP training split so that only the first few samples are used.
# np.ones() raises ValueError if this goes negative - confirm the intent.
n_steps = len(X_train) - 6062

ss = np.ones(n_steps)   # ss[i] = test-set MSE after the i-th update

# Hoisted out of the loop: the test features as an np.matrix (so `*` is a
# matrix product) and the test targets as a plain ndarray (newer scikit-learn
# versions reject np.matrix inputs to metric functions).
X_test_mat = np.matrix(X_test)
y_test_arr = np.asarray(y_test)

w = np.matrix([[1],[1],[1],[1],[1]])    # initial weights, 5x1 column
for i in range(n_steps):
    t = np.matrix(X_train[i])       # current sample as a row matrix
    yt = np.matrix(y_train[i])      # its target as a 1x1 matrix
    l = t.T * (yt - t * w)          # residual scaled onto each feature
    w = w + LEARNING_RATE * l
    # Predict the whole test set with one matrix product instead of a
    # row-by-row Python loop.
    y_tt = X_test_mat * w
    ss[i] = metrics.mean_squared_error(y_test_arr, np.asarray(y_tt))

# Show the learned weight column.
print(w)

# Predict every test sample with a single matrix product rather than filling
# the result one row at a time. Wrapping X_test in np.matrix makes `*` a
# matrix product regardless of whether X_test is an ndarray or a matrix.
y_tt = np.matrix(X_test) * w

# Transpose so that row 0 holds all values; the plotting code reads re[0] and
# res[0]. NOTE: `re` shadows the name of the stdlib regex module - the name is
# kept only because the plotting code depends on it.
re = np.array(y_test.T)
res = np.array(y_tt.T)

# Scatter the predicted values against the measured ones; the thick black
# diagonal spans the range of y and marks where prediction equals measurement.
fig, ax = plt.subplots()
measured, predicted = re[0], res[0]
ax.scatter(measured, predicted)
lo, hi = y.min(), y.max()
diagonal = [lo, hi]
ax.plot(diagonal, diagonal, 'k', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
复制代码

数据集: http://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant