Linear Regression Analysis

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Data source: http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv
data = pd.read_csv('./Advertising.csv')
print(data.head())
 
   Unnamed: 0     TV  Radio  Newspaper  Sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
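A side note: the Unnamed: 0 column is just the row index stored in the CSV. If desired, the file can be re-read with that column as the DataFrame index (a minimal sketch, assuming the same local file):

# Treat the first CSV column as the index so 'Unnamed: 0' does not appear.
data = pd.read_csv('./Advertising.csv', index_col=0)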
In [3]:
x = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']
In [4]:
# Plot each feature against Sales on one figure
plt.plot(data['TV'], y, 'ro', label='TV')
plt.plot(data['Radio'], y, 'g^', label='Radio')
plt.plot(data['Newspaper'], y, 'b*', label='Newspaper')
plt.legend(loc='lower right')
plt.grid()
plt.show()
 
In [5]:
plt.figure(figsize=(9,12)) 
plt.subplot(311) 
plt.plot(data['TV'], y, 'ro') 
plt.title('TV')
plt.grid()
plt.subplot(312) 
plt.plot(data['Radio'], y, 'g^') 
plt.title('Radio')
plt.grid()
plt.subplot(313) 
plt.plot(data['Newspaper'], y, 'b*') 
plt.title('Newspaper')
plt.grid()
plt.tight_layout()
plt.show()
 
In [6]:
# Feature column names
feature_cols = ['TV', 'Radio', 'Newspaper']
# Feature matrix
X = data[feature_cols]
# Target variable
y = data['Sales']
In [8]:
# Split into training and test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older releases
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
 
(150, 3) (50, 3) (150,) (50,)
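For reference, train_test_split holds out 25% of the samples by default, which is where the 150/50 split above comes from; the same split can be requested explicitly (a sketch, not from the original post):

# Equivalent to the default split: 25% of the 200 rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)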
In [9]:
# The scatter plots above suggest a roughly linear relationship
from sklearn.linear_model import LinearRegression
linereg = LinearRegression()
model = linereg.fit(X_train, y_train)
 
In [10]:
print(model)
 
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [11]:
linereg.intercept_
Out[11]:
2.8769666223179371
In [12]:
linereg.coef_
Out[12]:
array([ 0.04656457,  0.17915812,  0.00345046])
In [13]:
list(zip(feature_cols, linereg.coef_))
Out[13]:
[('TV', 0.046564567874150253),
 ('Radio', 0.17915812245088836),
 ('Newspaper', 0.0034504647111804204)]
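As a sanity check (my addition, not part of the original notebook), a prediction can be reconstructed by hand from the intercept and coefficients, i.e. Sales ≈ 2.877 + 0.0466*TV + 0.1792*Radio + 0.0035*Newspaper, and compared with predict:

import numpy as np

# Apply the fitted equation manually to the first test row and
# confirm it matches the model's own prediction.
row = X_test.iloc[0]
manual = linereg.intercept_ + np.dot(linereg.coef_, row.values)
assert np.isclose(manual, linereg.predict(X_test.iloc[[0]])[0])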
In [14]:
y_predict = linereg.predict(X_test)
In [16]:
y_predict
Out[16]:
array([ 21.70910292,  16.41055243,   7.60955058,  17.80769552,
        18.6146359 ,  23.83573998,  16.32488681,  13.43225536,
         9.17173403,  17.333853  ,  14.44479482,   9.83511973,
        17.18797614,  16.73086831,  15.05529391,  15.61434433,
        12.42541574,  17.17716376,  11.08827566,  18.00537501,
         9.28438889,  12.98458458,   8.79950614,  10.42382499,
        11.3846456 ,  14.98082512,   9.78853268,  19.39643187,
        18.18099936,  17.12807566,  21.54670213,  14.69809481,
        16.24641438,  12.32114579,  19.92422501,  15.32498602,
        13.88726522,  10.03162255,  20.93105915,   7.44936831,
         3.64695761,   7.22020178,   5.9962782 ,  18.43381853,
         8.39408045,  14.08371047,  15.02195699,  20.35836418,
        20.57036347,  19.60636679])
In [18]:
y_test
Out[18]:
58     23.8
40     16.6
34      9.5
102    14.8
184    17.6
198    25.5
95     16.9
4      12.9
29     10.5
168    17.1
171    14.5
18     11.3
11     17.4
89     16.7
110    13.4
118    15.9
159    12.9
35     12.8
136     9.5
59     18.4
51     10.7
16     12.5
44      8.5
94     11.5
31     11.9
162    14.9
38     10.1
28     18.9
193    19.6
27     15.9
47     23.2
165    11.9
194    17.3
177    11.7
176    20.2
97     15.5
174    11.5
73     11.0
69     22.3
172     7.6
108     5.3
107     8.7
189     6.7
14     19.0
56      5.5
19     14.6
114    14.6
39     21.5
185    22.6
124    19.7
Name: Sales, dtype: float64
In [19]:
sum_mean = 0
for i in range(len(y_predict)):
    sum_mean += (y_predict[i] - y_test.values[i]) ** 2
In [20]:
sum_mean
Out[20]:
98.652281011416832
In [22]:
np.sqrt(sum_mean/len(y_predict))
Out[22]:
1.4046514230328948
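The same number can be obtained with sklearn's metrics module (a sketch; mean_squared_error returns the MSE, so take its square root):

from sklearn.metrics import mean_squared_error

# RMSE over the test set, computed via sklearn instead of the manual loop.
print(np.sqrt(mean_squared_error(y_test, y_predict)))  # ≈ 1.4047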
In [24]:
plt.plot(range(len(y_predict)), y_predict, 'b', label="predict")
plt.plot(range(len(y_predict)), y_test, 'r', label="test")
Out[24]:
[<matplotlib.lines.Line2D at 0x10f8960d0>]
In [25]:
plt.legend(loc="upper right")
plt.xlabel("sample index")
plt.ylabel("Sales")
plt.show()
 
In [26]:
# Eyeballing the plots, Newspaper spend shows little relationship with Sales,
# and its fitted coefficient ('Newspaper', 0.0034504647111804204) is also tiny,
# so it is worth trying to remove the Newspaper feature (see the correlation check below).
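To back up the eyeballed judgment with a number, the feature-target correlations can be checked (a sketch, assuming the data frame loaded above):

# Pearson correlation of each column with Sales; Newspaper's is the weakest.
print(data[['TV', 'Radio', 'Newspaper', 'Sales']].corr()['Sales'])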
In [27]:
x = data[['TV', 'Radio']]
In [28]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)
In [29]:
linereg = LinearRegression()
model = linereg.fit(X_train, y_train)
In [30]:
model.intercept_
Out[30]:
2.9272373202664852
In [31]:
model.coef_
Out[31]:
array([ 0.04660234,  0.18117959])
In [34]:
y_predict = linereg.predict(X_test)
In [35]:
y_predict
Out[35]:
array([ 21.73751851,  16.40451622,   7.64073276,  17.81512707,
        18.6140367 ,  23.75320401,  16.26267467,  13.30968011,
         9.11623605,  17.24121988,  14.37997584,   9.86630093,
        17.28107008,  16.70455883,  14.93571851,  15.47067849,
        12.39847009,  17.21737409,  11.18626133,  18.09114847,
         9.34543641,  12.71804909,   8.75327159,  10.468843  ,
        11.34116649,  14.98646893,   9.77329331,  19.43186663,
        18.31005062,  17.14215851,  21.62609193,  14.47149683,
        16.3536012 ,  12.27215653,  19.97488243,  15.34878155,
        13.90760851,   9.99030388,  20.98440888,   7.482353  ,
         3.61019982,   7.1944428 ,   5.99097416,  18.39958364,
         8.35858094,  14.12195436,  15.05074527,  20.38304162,
        20.65191677,  19.47457534])
In [36]:
sum_mean = 0
for i in range(len(y_predict)):
    sum_mean += (y_predict[i] - y_test.values[i]) ** 2
In [37]:
np.sqrt(sum_mean/len(y_predict))
Out[37]:
1.3879034699382888
In [38]:
RMSE = np.sqrt(sum_mean / len(y_predict))
In [39]:
# RMSE (Root Mean Squared Error): the smaller it is, the closer the predictions are to the true values
plt.plot(range(len(y_predict)), y_predict, 'b', label="predict")
plt.plot(range(len(y_predict)), y_test, 'r', label="test")
plt.legend(loc="upper right")
plt.xlabel("sample index")
plt.ylabel("Sales")
plt.show()
 
In [ ]:
# Summary
'''
After removing the Newspaper feature, the RMSE decreased, which suggests that
Newspaper may not be a suitable feature for predicting sales; this gives us a
new, simpler model. We could also try other feature combinations and compare
the resulting errors (see the sketch below).
Machine learning has an "Occam's razor" principle: if a simple model solves
the problem, do not use a more complex one, because complex models add
uncertainty, cost extra human and computational effort, and tend to overfit.
'''
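To make the "try other feature combinations" idea concrete, here is a minimal sketch (my addition, not in the original post) that fits a model on every non-empty subset of the features and reports its test-set RMSE:

from itertools import combinations

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Score every non-empty feature subset by RMSE on the same held-out split.
for k in range(1, len(feature_cols) + 1):
    for cols in combinations(feature_cols, k):
        X_sub = data[list(cols)]
        X_tr, X_te, y_tr, y_te = train_test_split(X_sub, y, random_state=1)
        fit = LinearRegression().fit(X_tr, y_tr)
        rmse = np.sqrt(mean_squared_error(y_te, fit.predict(X_te)))
        print(cols, round(rmse, 4))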