1. Data generation

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset

cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

# make_regression: generates random data for a regression problem
# Parameters:
#   n_samples: number of samples
#   n_features: number of features
#   n_informative: number of informative features
#   bias: bias term of the underlying linear model
#   noise: standard deviation of the Gaussian noise
#   random_state: seed for the random number generator

# simple regression data (one input variable)
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression problem with one input variable')
X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,
                            n_informative=1, bias = 150.0,
                            noise = 30, random_state=0)
plt.scatter(X_R1, y_R1, marker= 'o', s=50)
plt.show()


# more complex regression data (several input variables)
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex regression problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples = 100,
                           n_features = 7, random_state=0)

plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)
plt.show()

# synthetic data for a classification problem
plt.figure()
plt.title('Sample binary classification problem with two informative features')
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
                                n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y = 0.1,
                                class_sep = 0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,
           marker= 'o', s=50, cmap=cmap_bold)
plt.show()


# more difficult synthetic dataset for classification (binary)
# with classes that are not linearly separable
X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,
                       cluster_std = 1.3, random_state = 4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
           marker= 'o', s=50, cmap=cmap_bold)
plt.show()


# Breast Cancer classification dataset
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)


# Communities and Crime dataset
(X_crime, y_crime) = load_crime_dataset()

KNN classification

from adspy_shared_utilities import plot_two_class_knn

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
                                                   random_state=0)

# the third argument is k, the number of nearest neighbours used by KNN
plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)

KNN regression

from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state = 0)

knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'
     .format(knnreg.score(X_test, y_test)))
[ 231.71  148.36  150.59  150.59   72.15  166.51  141.91  235.57  208.26
  102.1   191.32  134.5   228.32  148.36  159.17  113.47  144.04  199.23
  143.19  166.51  231.71  208.26  128.02  123.14  141.91]
R-squared test score: 0.425
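With the default uniform weights, each KNN prediction is just the mean of the targets of the K nearest training points. A minimal sketch to verify this for the first test sample, assuming the knnreg, X_train, y_train and X_test objects from the cell above are still in scope:

# indices of the 5 nearest training neighbours of the first test point
dist, idx = knnreg.kneighbors(X_test[:1])

# averaging their target values reproduces knnreg.predict for uniform weights
manual_pred = y_train[idx[0]].mean()
print(manual_pred, knnreg.predict(X_test[:1])[0])  # the two numbers should match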

# Effect of K on the KNN regression results

fig, subaxes = plt.subplots(1, 2, figsize=(8,4))
X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5],
                                                    random_state = 0)

for thisaxis, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,
                  label='Predicted', alpha=0.8)
    thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN regression (K={})'.format(K))
    thisaxis.legend()
plt.tight_layout()

# plot k-NN regression on sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5,20))
# 500 evenly spaced input values in the interval (-3, 3)
X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
                                                    random_state = 0)

for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    # this plot looks like a continuous curve, but it is really
    # 500 closely spaced predicted points
    thisaxis.plot(X_predict_input, y_predict_output)
    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN Regression (K={})\n\
Train $R^2 = {:.3f}$, Test $R^2 = {:.3f}$'
                       .format(K, train_score, test_score))
    thisaxis.legend()
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

 Effect of the parameter K on KNN regression predictions

 

Linear regression

from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

# coef_: the weight of each feature
# (coef_[0]*x[0] + ... + coef_[n]*x[n] + intercept_ = y)
print('linear model coeff (w): {}'
     .format(linreg.coef_))
# intercept_: the bias (intercept) term of the linear model
print('linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))
linear model coeff (w): [ 45.71]
linear model intercept (b): 148.446
R-squared score (training): 0.679
R-squared score (test): 0.492
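As a sanity check, the coefficient and intercept printed above fully define the fitted line, so the predictions can be reproduced by hand. A minimal sketch, assuming linreg and X_test from the cell above:

import numpy as np

# y_hat = w * x + b, computed directly from the fitted parameters
manual_pred = X_test @ linreg.coef_ + linreg.intercept_
print(np.allclose(manual_pred, linreg.predict(X_test)))  # expected: True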

Plotting the linear regression fit

plt.figure(figsize=(5,4))
plt.scatter(X_R1, y_R1, marker= 'o', s=50, alpha=0.8)
# draw the fitted straight line
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.show()

Multivariate linear regression

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('Crime dataset')
print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))
linear model intercept: 3861.708902399444
linear model coeff:
[  1.62e-03  -1.03e+02   1.61e+01  -2.94e+01  -1.92e+00  -1.47e+01
  -2.41e-03   1.46e+00  -1.46e-02  -1.08e+01   4.35e+01  -6.92e+00
   4.95e+00  -4.11e+00  -3.63e+00   8.98e-03   8.33e-03   4.84e-03
  -5.25e+00  -1.59e+01   7.47e+00   2.31e+00  -2.48e-01   1.22e+01
  -2.90e+00  -1.49e+00   4.96e+00   5.21e+00   1.82e+02   1.15e+01
   1.54e+02  -3.40e+02  -1.22e+02   2.75e+00  -2.87e+01   2.39e+00
   9.44e-01   3.18e+00  -1.17e+01  -5.46e-03   4.24e+01  -1.10e-03
  -9.23e-01   5.13e+00  -4.69e+00   1.13e+00  -1.70e+01  -5.00e+01
   5.64e+01  -2.94e+01   3.42e-01  -3.10e+01   2.89e+01  -5.46e+01
   6.75e+02   8.54e+01  -3.35e+02  -3.17e+01   2.96e+01   7.07e+00
   7.46e+01   2.01e-02  -3.96e-01   3.15e+01   1.00e+01  -1.60e+00
  -5.63e-01   2.82e+00  -2.96e+01   1.08e+11  -1.01e-03  -1.08e+11
   1.08e+11  -3.13e+08  -4.95e-01   3.13e+08  -3.13e+08   1.47e+00
  -2.78e+00   1.12e+00  -3.70e+01   1.09e-01   3.07e-01   2.06e+01
   9.24e-01  -6.05e-01  -1.92e+00   5.88e-01]
R-squared score (training): 0.668
R-squared score (test): 0.520

 Ridge regression

from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)
# alpha is the regularization strength of ridge regression
linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))

 

Crime dataset
ridge regression linear model intercept: -3352.4230358464793
ridge regression linear model coeff:
[  1.95e-03   2.19e+01   9.56e+00  -3.59e+01   6.36e+00  -1.97e+01
  -2.81e-03   1.66e+00  -6.61e-03  -6.95e+00   1.72e+01  -5.63e+00
   8.84e+00   6.79e-01  -7.34e+00   6.70e-03   9.79e-04   5.01e-03
  -4.90e+00  -1.79e+01   9.18e+00  -1.24e+00   1.22e+00   1.03e+01
  -3.78e+00  -3.73e+00   4.75e+00   8.43e+00   3.09e+01   1.19e+01
  -2.05e+00  -3.82e+01   1.85e+01   1.53e+00  -2.20e+01   2.46e+00
   3.29e-01   4.02e+00  -1.13e+01  -4.70e-03   4.27e+01  -1.23e-03
   1.41e+00   9.35e-01  -3.00e+00   1.12e+00  -1.82e+01  -1.55e+01
   2.42e+01  -1.32e+01  -4.20e-01  -3.60e+01   1.30e+01  -2.81e+01
   4.39e+01   3.87e+01  -6.46e+01  -1.64e+01   2.90e+01   4.15e+00
   5.34e+01   1.99e-02  -5.47e-01   1.24e+01   1.04e+01  -1.57e+00
   3.16e+00   8.78e+00  -2.95e+01  -2.34e-04   3.14e-04  -4.13e-04
  -1.80e-04  -5.74e-01  -5.18e-01  -4.21e-01   1.53e-01   1.33e+00
   3.85e+00   3.03e+00  -3.78e+01   1.38e-01   3.08e-01   1.57e+01
   3.31e-01   3.36e+00   1.61e-01  -2.68e+00]
R-squared score (training): 0.671
R-squared score (test): 0.494
Number of non-zero features: 88


Ridge regression with normalized features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)

# rescale each feature to [0, 1]; fit the scaler on the training data only
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))
Crime dataset
ridge regression linear model intercept: 933.3906385044113
ridge regression linear model coeff:
[  88.69   16.49  -50.3   -82.91  -65.9    -2.28   87.74  150.95   18.88
  -31.06  -43.14 -189.44   -4.53  107.98  -76.53    2.86   34.95   90.14
   52.46  -62.11  115.02    2.67    6.94   -5.67 -101.55  -36.91   -8.71
   29.12  171.26   99.37   75.07  123.64   95.24 -330.61 -442.3  -284.5
 -258.37   17.66 -101.71  110.65  523.14   24.82    4.87  -30.47   -3.52
   50.58   10.85   18.28   44.11   58.34   67.09  -57.94  116.14   53.81
   49.02   -7.62   55.14  -52.09  123.39   77.13   45.5   184.91  -91.36
    1.08  234.09   10.39   94.72  167.92  -25.14   -1.18   14.6    36.77
   53.2   -78.86   -5.9    26.05  115.15   68.74   68.29   16.53  -97.91
  205.2    75.97   61.38  -79.83   67.27   95.67  -11.88]
R-squared score (training): 0.615
R-squared score (test): 0.599
Number of non-zero features: 88

Effect of the alpha regularization parameter on ridge regression
print('Ridge regression: effect of alpha regularization parameter\n')
# vary alpha (the regularization strength)
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, \
r-squared training: {:.2f}, r-squared test: {:.2f}\n'
         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))
Ridge regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 1.0: 87, r-squared training: 0.67, r-squared test: 0.50

Alpha = 1.00
num abs(coeff) > 1.0: 87, r-squared training: 0.66, r-squared test: 0.56

Alpha = 10.00
num abs(coeff) > 1.0: 87, r-squared training: 0.63, r-squared test: 0.59

Alpha = 20.00
num abs(coeff) > 1.0: 88, r-squared training: 0.61, r-squared test: 0.60

Alpha = 50.00
num abs(coeff) > 1.0: 86, r-squared training: 0.58, r-squared test: 0.58

Alpha = 100.00
num abs(coeff) > 1.0: 87, r-squared training: 0.55, r-squared test: 0.55

Alpha = 1000.00
num abs(coeff) > 1.0: 84, r-squared training: 0.31, r-squared test: 0.30
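Instead of scanning alpha values by hand as above, scikit-learn's RidgeCV can pick alpha by cross-validation over a grid. A minimal sketch, assuming the scaled crime data (X_train_scaled, X_test_scaled, y_train, y_test) from the cells above; the alpha grid is just an illustrative choice:

from sklearn.linear_model import RidgeCV

# RidgeCV fits the model for each candidate alpha and keeps the best one
ridge_cv = RidgeCV(alphas=[1, 10, 20, 50, 100, 1000]).fit(X_train_scaled, y_train)
print('best alpha: {}'.format(ridge_cv.alpha_))
print('R-squared score (test): {:.3f}'.format(ridge_cv.score(X_test_scaled, y_test)))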


Lasso regression

from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                   random_state = 0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)

print('Crime dataset')
print('lasso regression linear model intercept: {}'
     .format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'
     .format(linlasso.coef_))
print('Non-zero features: {}'
     .format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'
     .format(linlasso.score(X_test_scaled, y_test)))
print('Features with non-zero weight (sorted by absolute magnitude):')

for e in sorted (list(zip(list(X_crime), linlasso.coef_)),
                key = lambda e: -abs(e[1])):
    if e[1] != 0:
        print('\t{}, {:.3f}'.format(e[0], e[1]))
Crime dataset
lasso regression linear model intercept: 1186.6120619985809
lasso regression linear model coeff:
[    0.       0.      -0.    -168.18    -0.      -0.       0.     119.69
     0.      -0.       0.    -169.68    -0.       0.      -0.       0.
     0.       0.      -0.      -0.       0.      -0.       0.       0.
   -57.53    -0.      -0.       0.     259.33    -0.       0.       0.
     0.      -0.   -1188.74    -0.      -0.      -0.    -231.42     0.
  1488.37     0.      -0.      -0.      -0.       0.       0.       0.
     0.       0.      -0.       0.      20.14     0.       0.       0.
     0.       0.     339.04     0.       0.     459.54    -0.       0.
   122.69    -0.      91.41     0.      -0.       0.       0.      73.14
     0.      -0.       0.       0.      86.36     0.       0.       0.
  -104.57   264.93     0.      23.45   -49.39     0.       5.2      0.  ]
Non-zero features: 20
R-squared score (training): 0.631
R-squared score (test): 0.624

Features with non-zero weight (sorted by absolute magnitude):
	PctKidsBornNeverMar, 1488.365
	PctKids2Par, -1188.740
	HousVacant, 459.538
	PctPersDenseHous, 339.045
	NumInShelters, 264.932
	MalePctDivorce, 259.329
	PctWorkMom, -231.423
	pctWInvInc, -169.676
	agePct12t29, -168.183
	PctVacantBoarded, 122.692
	pctUrban, 119.694
	MedOwnCostPctIncNoMtg, -104.571
	MedYrHousBuilt, 91.412
	RentQrange, 86.356
	OwnOccHiQuart, 73.144
	PctEmplManu, -57.530
	PctBornSameState, -49.394
	PctForeignBorn, 23.449
	PctLargHouseFam, 20.144
	PctSameCity85, 5.198

Effect of the alpha regularization parameter on Lasso regression
print('Lasso regression: effect of alpha regularization\n\
parameter on number of features kept in final model\n')

for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)

    print('Alpha = {:.2f}\nFeatures kept: {}, r-squared training: {:.2f}, \
r-squared test: {:.2f}\n'
         .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))
Lasso regression: effect of alpha regularization
parameter on number of features kept in final model

Alpha = 0.50
Features kept: 35, r-squared training: 0.65, r-squared test: 0.58

Alpha = 1.00
Features kept: 25, r-squared training: 0.64, r-squared test: 0.60

Alpha = 2.00
Features kept: 20, r-squared training: 0.63, r-squared test: 0.62

Alpha = 3.00
Features kept: 17, r-squared training: 0.62, r-squared test: 0.63

Alpha = 5.00
Features kept: 12, r-squared training: 0.60, r-squared test: 0.61

Alpha = 10.00
Features kept: 6, r-squared training: 0.57, r-squared test: 0.58

Alpha = 20.00
Features kept: 2, r-squared training: 0.51, r-squared test: 0.50

Alpha = 50.00
Features kept: 1, r-squared training: 0.31, r-squared test: 0.30

Polynomial regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures


X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('linear model coeff (w): {}'
     .format(linreg.coef_))
print('linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

print('Now add polynomial features up to degree 2 (quadratic)')
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit_transform(X_F1)

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('(poly deg 2) linear model coeff (w):\n{}'
     .format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print('(poly deg 2) R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}\n'
     .format(linreg.score(X_test, y_test)))

print('Polynomial features can easily lead to overfitting, so they are\n\
usually combined with a regularized regression such as ridge')

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                   random_state = 0)
linreg = Ridge().fit(X_train, y_train)

print('(poly deg 2 + ridge) linear model coeff (w):\n{}'
     .format(linreg.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'
     .format(linreg.intercept_))
print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))
linear model coeff (w): [  4.42   6.     0.53  10.24   6.55  -2.02  -0.32]
linear model intercept (b): 1.543
R-squared score (training): 0.722
R-squared score (test): 0.722

Now add polynomial features up to degree 2 (quadratic)

(poly deg 2) linear model coeff (w):
[  3.41e-12   1.66e+01   2.67e+01  -2.21e+01   1.24e+01   6.93e+00
   1.05e+00   3.71e+00  -1.34e+01  -5.73e+00   1.62e+00   3.66e+00
   5.05e+00  -1.46e+00   1.95e+00  -1.51e+01   4.87e+00  -2.97e+00
  -7.78e+00   5.15e+00  -4.65e+00   1.84e+01  -2.22e+00   2.17e+00
  -1.28e+00   1.88e+00   1.53e-01   5.62e-01  -8.92e-01  -2.18e+00
   1.38e+00  -4.90e+00  -2.24e+00   1.38e+00  -5.52e-01  -1.09e+00]
(poly deg 2) linear model intercept (b): -3.206
(poly deg 2) R-squared score (training): 0.969
(poly deg 2) R-squared score (test): 0.805

Polynomial features can easily lead to overfitting, so they are
usually combined with a regularized regression such as ridge

(poly deg 2 + ridge) linear model coeff (w):
[ 0.    2.23  4.73 -3.15  3.86  1.61 -0.77 -0.15 -1.75  1.6   1.37  2.52
  2.72  0.49 -1.94 -1.63  1.51  0.89  0.26  2.05 -1.93  3.62 -0.72  0.63
 -3.16  1.29  3.55  1.73  0.94 -0.51  1.7  -1.98  1.81 -0.22  2.88 -0.89]
(poly deg 2 + ridge) linear model intercept (b): 5.418
(poly deg 2 + ridge) R-squared score (training): 0.826
(poly deg 2 + ridge) R-squared score (test): 0.825
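The feature expansion and the regularized fit can also be chained so that the polynomial transform is only ever fit on the training split. A minimal sketch using a Pipeline, assuming X_F1 and y_F1 from the data-generation cell:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1, random_state = 0)

# one estimator that expands to quadratic features and then fits ridge regression
poly_ridge = make_pipeline(PolynomialFeatures(degree=2), Ridge()).fit(X_train, y_train)
print('(pipeline) R-squared score (test): {:.3f}'.format(poly_ridge.score(X_test, y_test)))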


Linear models for classification
Logistic regression
Classifying fruit type using height and width as features
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
plot_class_regions_for_classifier_subplot)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
# X_fruits_2d and y_fruits_2d come from the fruit dataset loaded earlier
# in the course notebook (not shown in this post)
y_fruits_apple = y_fruits_2d == 1   # make into a binary problem: apples vs everything else
# as_matrix() converts the DataFrame to a plain NumPy array
# (in newer pandas versions use .values or .to_numpy() instead)
X_train, X_test, y_train, y_test = (
train_test_split(X_fruits_2d.as_matrix(),
                 y_fruits_apple.as_matrix(),
                 random_state = 0))

clf = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                          None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others',
                                          subaxes)

h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'
     .format(h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))

h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'
     .format(h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))
subaxes.set_xlabel('height')
subaxes.set_ylabel('width')

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

A fruit with height 6 and width 8 is predicted to be: an apple
A fruit with height 10 and width 7 is predicted to be: not an apple
Accuracy of Logistic regression classifier on training set: 0.77
Accuracy of Logistic regression classifier on test set: 0.73
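Logistic regression also exposes class probabilities, not just hard labels, which is more informative near the decision boundary. A minimal sketch, assuming the clf fitted in the cell above; the height/width values are only illustrative:

h, w = 6, 8
# probability of [not an apple, an apple] for this height/width
print(clf.predict_proba([[h, w]]))
# raw decision-function score (signed distance from the boundary, before the sigmoid)
print(clf.decision_function([[h, w]]))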

from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
plot_class_regions_for_classifier_subplot)


X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
                                                   random_state = 0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
clf = LogisticRegression().fit(X_train, y_train)
title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                         None, None, title, subaxes)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.80
Accuracy of Logistic regression classifier on test set: 0.80


Effect of the C regularization parameter on logistic regression

X_train, X_test, y_train, y_test = (
train_test_split(X_fruits_2d.as_matrix(),
                y_fruits_apple.as_matrix(),
                random_state=0))

fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))

for this_C, subplot in zip([0.1, 1, 100], subaxes):
    clf = LogisticRegression(C=this_C).fit(X_train, y_train)
    title ='Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)

    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             X_test, y_test, title,
                                             subplot)
plt.tight_layout()

Logistic regression on a real dataset

from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = LogisticRegression().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
Breast cancer dataset
Accuracy of Logistic regression classifier on training set: 0.96
Accuracy of Logistic regression classifier on test set: 0.96


SVM

Linear SVM

from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot


X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
this_C = 1.0
# linear kernel
clf = SVC(kernel = 'linear', C=this_C).fit(X_train, y_train)
title = 'Linear SVC, C = {:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)

Linear Support Vector Machine: C parameter

from sklearn.svm import LinearSVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))

for this_C, subplot in zip([0.00001, 100], subaxes):
    clf = LinearSVC(C=this_C).fit(X_train, y_train)
    title = 'Linear SVC, C = {:.5f}'.format(this_C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             None, None, title, subplot)
plt.tight_layout()

Linear SVM on a real dataset

from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = LinearSVC().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Linear SVC classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
Breast cancer dataset
Accuracy of Linear SVC classifier on training set: 0.74
Accuracy of Linear SVC classifier on test set: 0.74


Multi-class classification with linear models

from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)

clf = LinearSVC(C=5, random_state = 67).fit(X_train, y_train)
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)
Coefficients:
 [[-0.26  0.71]
 [-1.63  1.16]
 [ 0.03  0.29]
 [ 1.24 -1.64]]
Intercepts:
 [-3.29  1.2  -2.72  1.16]

 Multi-class results on the fruit dataset

plt.figure(figsize=(6,6))
colors = ['r', 'g', 'b', 'y']
cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FFFF00'])

plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],
           c=y_fruits_2d, cmap=cmap_fruits, edgecolor = 'black', alpha=.7)

x_0_range = np.linspace(-10, 15)

for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):
    # Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b,
    # and the decision boundary is defined as being all points with y = 0, to plot x_1 as a
    # function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:
    plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)

# target_names_fruits also comes from the fruit dataset loading cell (not shown here)
plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show()

Kernelized SVM

 

Classification

from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)

# RBF kernel (the SVC default)
plot_class_regions_for_classifier(SVC().fit(X_train, y_train),
                                 X_train, y_train, None, None,
                                 'Support Vector Classifier: RBF kernel')

# polynomial kernel, degree = 3
plot_class_regions_for_classifier(SVC(kernel = 'poly', degree = 3)
                                 .fit(X_train, y_train), X_train,
                                 y_train, None, None,
                                 'Support Vector Classifier: Polynomial kernel, degree = 3')

Effect of the gamma parameter on an RBF-kernel SVM

from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))

for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)
    title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(this_gamma)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                             None, None, title, subplot)
    plt.tight_layout()

 

Effect of gamma and C on an RBF-kernel SVM

from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)

for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):

    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
        clf = SVC(kernel = 'rbf', gamma = this_gamma,
                 C = this_C).fit(X_train, y_train)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                 X_test, y_test, title,
                                                 subplot)
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

 

SVM on unnormalized data

from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                   random_state = 0)

clf = SVC(C=10).fit(X_train, y_train)
print('Breast cancer dataset (unnormalized features)')
print('Accuracy of RBF-kernel SVC on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of RBF-kernel SVC on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
Breast cancer dataset (unnormalized features)
Accuracy of RBF-kernel SVC on training set: 1.00
Accuracy of RBF-kernel SVC on test set: 0.63

 

 SVM on normalized data

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(C=10).fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'
     .format(clf.score(X_train_scaled, y_train)))
print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'
     .format(clf.score(X_test_scaled, y_test)))
Breast cancer dataset (normalized with MinMax scaling)
RBF-kernel SVC (with MinMax scaling) training set accuracy: 0.98
RBF-kernel SVC (with MinMax scaling) test set accuracy: 0.96
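To keep the scaling step from ever seeing the test data, the scaler and the classifier can be wrapped in a single Pipeline and evaluated with cross-validation. A minimal sketch, assuming X_cancer and y_cancer from the data-generation cell; C=10 just mirrors the cell above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# the scaler is re-fit on each training fold inside cross_val_score
svc_pipe = make_pipeline(MinMaxScaler(), SVC(C=10))
scores = cross_val_score(svc_pipe, X_cancer, y_cancer, cv=3)
print('Cross-validated accuracy: {:.2f} (+/- {:.2f})'.format(scores.mean(), scores.std()))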

 Cross-validation

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors = 5)
X = X_fruits_2d.as_matrix()
y = y_fruits_2d.as_matrix()
# run cross-validation
cv_scores = cross_val_score(clf, X, y)

print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))
Cross-validation scores (3-fold): [ 0.77  0.74  0.83]
Mean cross-validation score (3-fold): 0.781
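The 3-fold result above relies on the old scikit-learn default; newer versions default to 5 folds, so it is safer to pass cv explicitly (for classifiers the folds are stratified by class). A minimal sketch, assuming clf, X, and y from the cell above:

from sklearn.model_selection import cross_val_score

# pass cv explicitly so the number of folds does not depend on the library version
cv_scores_5 = cross_val_score(clf, X, y, cv=5)
print('Cross-validation scores (5-fold):', cv_scores_5)
print('Mean cross-validation score (5-fold): {:.3f}'.format(np.mean(cv_scores_5)))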


Validation curve

from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                            param_name='gamma',
                                            param_range=param_range, cv=3)
print(train_scores)
[[ 0.49  0.42  0.41]
 [ 0.84  0.72  0.76]
 [ 0.92  0.9   0.93]
 [ 1.    1.    0.98]]
print(test_scores)
[[ 0.45  0.32  0.33]
 [ 0.82  0.68  0.61]
 [ 0.41  0.84  0.67]
 [ 0.36  0.21  0.39]]
# This code based on scikit-learn validation_plot example
# See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
plt.xlabel('$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
            color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.2,
                color='darkorange', lw=lw)

plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
            color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.2,
                color='navy', lw=lw)

plt.legend(loc='best')
plt.show()

Decision trees

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split


iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)
clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))
Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.97
 Limiting tree depth to avoid overfitting

# max_depth sets the maximum depth of the decision tree
clf2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf2.score(X_test, y_test)))
Accuracy of Decision Tree classifier on training set: 0.98
Accuracy of Decision Tree classifier on test set: 0.97
Visualizing the decision tree
plot_decision_tree(clf, iris.feature_names, iris.target_names)

# the pruned tree with max depth 3
plot_decision_tree(clf2, iris.feature_names, iris.target_names)

Feature importance
from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
plt.show()

print('Feature importances: {}'.format(clf.feature_importances_))

Feature importances: [ 0.    0.02  0.56  0.42]

from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
tree_max_depth = 4

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train

    clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
    title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
    plot_class_regions_for_classifier_subplot(clf, X, y, None,
                                             None, title, axis,
                                             iris.target_names)

    axis.set_xlabel(iris.feature_names[pair[0]])
    axis.set_ylabel(iris.feature_names[pair[1]])

plt.tight_layout()
plt.show()

Decision trees on a real dataset

from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,
                            random_state = 0).fit(X_train, y_train)

plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

 

 

print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

plt.figure(figsize=(10,6), dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()

plt.show()
Breast cancer dataset: decision tree
Accuracy of DT classifier on training set: 0.96
Accuracy of DT classifier on test set: 0.94

 




