Linear Regression Analysis

import matplotlib.pyplot as plt
import numpy as np   # needed later for np.sqrt
import pandas as pd
# Data source: http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv
data = pd.read_csv('./Advertising.csv')
print(data.head())
 
   Unnamed: 0     TV  Radio  Newspaper  Sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
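The 'Unnamed: 0' column above is just the CSV's row number. As a side note (not in the original notebook), the file can also be read straight from the URL given above, assuming it is still reachable, with index_col=0 so that column becomes the index instead:

# Variant: read directly from the URL and fold the row-number column into the index
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)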
In [3]:
x = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']
In [4]:
# Plot 1: all three features against sales in a single figure
plt.plot(data['TV'], y, 'ro', label='TV')
plt.plot(data['Radio'], y, 'g^', label='Radio')
plt.plot(data['Newspaper'], y, 'b*', label='Newspaper')
plt.legend(loc='lower right')
plt.grid()
plt.show()
 
In [5]:
plt.figure(figsize=(9,12)) 
plt.subplot(311) 
plt.plot(data['TV'], y, 'ro') 
plt.title('TV')
plt.grid()
plt.subplot(312) 
plt.plot(data['Radio'], y, 'g^') 
plt.title('Radio')
plt.grid()
plt.subplot(313) 
plt.plot(data['Newspaper'], y, 'b*') 
plt.title('Newspaper')
plt.grid()
plt.tight_layout()
plt.show()
 
In [6]:
# Feature column names
feature_cols = ['TV', 'Radio', 'Newspaper']
# Feature matrix
X = data[feature_cols]
# Target values
y = data['Sales']
In [8]:
# Split into training and test sets
# (sklearn.cross_validation was removed in later scikit-learn releases;
#  the current module is sklearn.model_selection)
from sklearn.model_selection import train_test_split
# The default test_size=0.25 yields 150 training rows and 50 test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
 
(150, 3) (50, 3) (150,) (50,)
In [9]:
# The scatter plots above suggest a roughly linear relationship
from sklearn.linear_model import LinearRegression
linereg = LinearRegression()
model = linereg.fit(X_train, y_train)
 
In [10]:
print(model)
 
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [11]:
linereg.intercept_
Out[11]:
2.8769666223179371
In [12]:
linereg.coef_
Out[12]:
array([ 0.04656457,  0.17915812,  0.00345046])
In [13]:
list(zip(feature_cols, linereg.coef_))
Out[13]:
[('TV', 0.046564567874150253),
 ('Radio', 0.17915812245088836),
 ('Newspaper', 0.0034504647111804204)]
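Radio has the largest coefficient and Newspaper's is nearly zero. As an optional cross-check (not in the original notebook, and assuming the statsmodels package is available), an OLS fit via statsmodels reports a p-value for each coefficient; a large p-value for Newspaper would support dropping it later:

# Optional significance check with statsmodels
import statsmodels.formula.api as smf
ols_model = smf.ols('Sales ~ TV + Radio + Newspaper', data=data).fit()
print(ols_model.summary())  # per-coefficient p-values, R², etc.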
In [14]:
y_predict = linereg.predict(X_test)
In [16]:
y_predict
Out[16]:
array([ 21.70910292,  16.41055243,   7.60955058,  17.80769552,
        18.6146359 ,  23.83573998,  16.32488681,  13.43225536,
         9.17173403,  17.333853  ,  14.44479482,   9.83511973,
        17.18797614,  16.73086831,  15.05529391,  15.61434433,
        12.42541574,  17.17716376,  11.08827566,  18.00537501,
         9.28438889,  12.98458458,   8.79950614,  10.42382499,
        11.3846456 ,  14.98082512,   9.78853268,  19.39643187,
        18.18099936,  17.12807566,  21.54670213,  14.69809481,
        16.24641438,  12.32114579,  19.92422501,  15.32498602,
        13.88726522,  10.03162255,  20.93105915,   7.44936831,
         3.64695761,   7.22020178,   5.9962782 ,  18.43381853,
         8.39408045,  14.08371047,  15.02195699,  20.35836418,
        20.57036347,  19.60636679])
In [18]:
y_test
Out[18]:
58     23.8
40     16.6
34      9.5
102    14.8
184    17.6
198    25.5
95     16.9
4      12.9
29     10.5
168    17.1
171    14.5
18     11.3
11     17.4
89     16.7
110    13.4
118    15.9
159    12.9
35     12.8
136     9.5
59     18.4
51     10.7
16     12.5
44      8.5
94     11.5
31     11.9
162    14.9
38     10.1
28     18.9
193    19.6
27     15.9
47     23.2
165    11.9
194    17.3
177    11.7
176    20.2
97     15.5
174    11.5
73     11.0
69     22.3
172     7.6
108     5.3
107     8.7
189     6.7
14     19.0
56      5.5
19     14.6
114    14.6
39     21.5
185    22.6
124    19.7
Name: Sales, dtype: float64
In [19]:
# Sum of squared prediction errors (initialize the accumulator first)
sum_mean = 0
for i in range(len(y_predict)):
    sum_mean += (y_predict[i] - y_test.values[i]) ** 2
In [20]:
sum_mean
Out[20]:
98.652281011416832
In [22]:
np.sqrt(sum_mean/len(y_predict))
Out[22]:
1.4046514230328948
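The same RMSE (≈1.4047) can be obtained without the manual loop; a minimal equivalent using sklearn.metrics:

# Equivalent RMSE computation via sklearn.metrics
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
print(rmse)  # matches the value above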
In [24]:
plt.plot(range(len(y_predict)), y_predict, 'b', label='predict')
plt.plot(range(len(y_predict)), y_test, 'r', label='test')
Out[24]:
[<matplotlib.lines.Line2D at 0x10f8960d0>]
In [25]:
plt.legend(loc='upper right')
plt.xlabel('test sample index')
plt.ylabel('sales')
plt.show()
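A predicted-vs-actual scatter (not in the original post) is another quick diagnostic; points close to the dashed y = x line indicate accurate predictions:

# Optional diagnostic: predicted vs. actual sales
plt.scatter(y_test, y_predict)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')  # y = x reference line
plt.xlabel('actual sales')
plt.ylabel('predicted sales')
plt.show()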
 
In [26]:
# From the scatter plots, Newspaper spending shows no obvious relationship with sales,
# and its fitted coefficient ('Newspaper', 0.0034504647111804204) is also tiny,
# so we can try removing the Newspaper feature.
In [27]:
x = data[['TV', 'Radio']]
In [28]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)
In [29]:
# Refit the model using only TV and Radio
linereg = LinearRegression()
model = linereg.fit(X_train, y_train)
In [30]:
model.intercept_
Out[30]:
2.9272373202664852
In [31]:
model.coef_
Out[31]:
array([ 0.04660234,  0.18117959])
In [34]:
y_predict = linereg.predict(X_test)
In [35]:
y_predict
Out[35]:
array([ 21.73751851,  16.40451622,   7.64073276,  17.81512707,
        18.6140367 ,  23.75320401,  16.26267467,  13.30968011,
         9.11623605,  17.24121988,  14.37997584,   9.86630093,
        17.28107008,  16.70455883,  14.93571851,  15.47067849,
        12.39847009,  17.21737409,  11.18626133,  18.09114847,
         9.34543641,  12.71804909,   8.75327159,  10.468843  ,
        11.34116649,  14.98646893,   9.77329331,  19.43186663,
        18.31005062,  17.14215851,  21.62609193,  14.47149683,
        16.3536012 ,  12.27215653,  19.97488243,  15.34878155,
        13.90760851,   9.99030388,  20.98440888,   7.482353  ,
         3.61019982,   7.1944428 ,   5.99097416,  18.39958364,
         8.35858094,  14.12195436,  15.05074527,  20.38304162,
        20.65191677,  19.47457534])
In [36]:
# Recompute the sum of squared errors for the two-feature model
sum_mean = 0
for i in range(len(y_predict)):
    sum_mean += (y_predict[i] - y_test.values[i]) ** 2
In [37]:
np.sqrt(sum_mean/len(y_predict))
Out[37]:
1.3879034699382888
In [38]:
RMSE = np.sqrt(sum_mean / len(y_predict))
In [39]:
# RMSE (Root Mean Squared Error): the smaller it is, the closer the predictions are to the true values
plt.plot(range(len(y_predict)), y_predict, 'b', label='predict')
plt.plot(range(len(y_predict)), y_test, 'r', label='test')
plt.legend(loc='upper right')
plt.xlabel('test sample index')
plt.ylabel('sales')
plt.show()
 
In [ ]:
# Summary
'''
After removing the Newspaper feature, the RMSE decreased, which suggests
Newspaper is probably not a useful feature for predicting sales; we thus
arrive at a new, simpler model. We could also build models from other
feature combinations and compare their errors (a sketch follows below).
Machine learning has an "Occam's razor" principle: if a simple model solves
the problem, do not use a more complex one. Complex models tend to add
uncertainty, cost more human and computational effort, and overfit more easily.
'''
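Following the idea above, a minimal sketch (not from the original post) that loops over every non-empty feature subset, refits on the same split, and prints each subset's test RMSE:

# Compare test RMSE across all non-empty feature subsets
from itertools import combinations

all_cols = ['TV', 'Radio', 'Newspaper']
for k in range(1, len(all_cols) + 1):
    for cols in combinations(all_cols, k):
        X_sub = data[list(cols)]
        X_tr, X_te, y_tr, y_te = train_test_split(X_sub, y, random_state=1)
        pred = LinearRegression().fit(X_tr, y_tr).predict(X_te)
        rmse = np.sqrt(((pred - y_te.values) ** 2).mean())
        print(cols, round(rmse, 4))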