1. Data generation
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset

cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF', '#000000'])

# make_regression: generate random data for a regression problem
# Parameters:
#   n_samples:     number of samples
#   n_features:    number of features
#   n_informative: number of informative features
#   bias:          bias term of the underlying linear model
#   noise:         standard deviation of the Gaussian noise
#   random_state:  seed for the random number generator

# Simple regression data (one feature)
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression problem with one input variable')
X_R1, y_R1 = make_regression(n_samples=100, n_features=1,
                             n_informative=1, bias=150.0,
                             noise=30, random_state=0)
plt.scatter(X_R1, y_R1, marker='o', s=50)
plt.show()


# More complex regression data (several features)
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex regression problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples=100,
                            n_features=7, random_state=0)

plt.scatter(X_F1[:, 2], y_F1, marker='o', s=50)
plt.show()

# Data for a classification problem
plt.figure()
plt.title('Sample binary classification problem with two informative features')
X_C2, y_C2 = make_classification(n_samples=100, n_features=2,
                                 n_redundant=0, n_informative=2,
                                 n_clusters_per_class=1, flip_y=0.1,
                                 class_sep=0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,
            marker='o', s=50, cmap=cmap_bold)
plt.show()


# More difficult synthetic dataset for classification (binary)
# with classes that are not linearly separable
X_D2, y_D2 = make_blobs(n_samples=100, n_features=2, centers=8,
                        cluster_std=1.3, random_state=4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:, 0], X_D2[:, 1], c=y_D2,
            marker='o', s=50, cmap=cmap_bold)
plt.show()


# Breast cancer classification dataset
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)


# Communities and Crime dataset
(X_crime, y_crime) = load_crime_dataset()
KNN classification
from adspy_shared_utilities import plot_two_class_knn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
                                                     random_state=0)

# The third argument is k, the number of nearest neighbors used by KNN
plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)
KNN regression
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'
      .format(knnreg.score(X_test, y_test)))
[ 231.71  148.36  150.59  150.59   72.15  166.51  141.91  235.57  208.26
  102.1   191.32  134.5   228.32  148.36  159.17  113.47  144.04  199.23
  143.19  166.51  231.71  208.26  128.02  123.14  141.91]
R-squared test score: 0.425
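For regression estimators, score returns the coefficient of determination R². A minimal sketch, assuming knnreg, X_test, and y_test from the block above are still in scope, that recomputes the same value with sklearn.metrics.r2_score:

from sklearn.metrics import r2_score

# R^2 = 1 - SS_res / SS_tot; should match knnreg.score(X_test, y_test)
y_pred = knnreg.predict(X_test)
print('r2_score: {:.3f}'.format(r2_score(y_test, y_pred)))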
# Effect of k on the KNN regression model
fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
X_predict_input = np.linspace(-3, 3, 50).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state=0)

for thisaxis, K in zip(subaxes, [1, 3]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5, 0.75])
    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize=10,
                  label='Predicted', alpha=0.8)
    thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN regression (K={})'.format(K))
    thisaxis.legend()
plt.tight_layout()
# Plot k-NN regression on the sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5, 20))

# Generate 500 evenly spaced points in the interval (-3, 3)
X_predict_input = np.linspace(-3, 3, 500).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
                                                     random_state=0)

for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    # This plot looks like a curve, but it is really densely spaced discrete points
    thisaxis.plot(X_predict_input, y_predict_output)
    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    thisaxis.set_xlabel('Input feature')
    thisaxis.set_ylabel('Target value')
    thisaxis.set_title('KNN Regression (K={})\n\
Train $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'
                       .format(K, train_score, test_score))
    thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
Effect of the KNN parameter k on regression
Linear regression
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
                                                     random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

# coef_: the feature weights w
# (y = coef_[0]*x[0] + ... + coef_[n]*x[n] + intercept_)
print('linear model coeff (w): {}'
      .format(linreg.coef_))
# intercept_: the bias (intercept) term b
print('linear model intercept (b): {:.3f}'
      .format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
      .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
      .format(linreg.score(X_test, y_test)))
linear model coeff (w): [ 45.71]
linear model intercept (b): 148.446
R-squared score (training): 0.679
R-squared score (test): 0.492
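A quick sanity check of how coef_ and intercept_ combine into a prediction; a minimal sketch, assuming linreg and X_test from the block above are still in scope:

# For a single feature, predict(x) is just w*x + b
x0 = X_test[0]
manual = linreg.coef_[0] * x0[0] + linreg.intercept_
print('manual: {:.3f}, predict: {:.3f}'.format(manual, linreg.predict([x0])[0]))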
Plotting the linear regression fit
plt.figure(figsize=(5, 4))
plt.scatter(X_R1, y_R1, marker='o', s=50, alpha=0.8)
# Draw the fitted line
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.show()
Multivariate linear regression
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                     random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

print('Crime dataset')
print('linear model intercept: {}'
      .format(linreg.intercept_))
print('linear model coeff:\n{}'
      .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
      .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
      .format(linreg.score(X_test, y_test)))
linear model intercept: 3861.708902399444
linear model coeff:
[ 1.62e-03 -1.03e+02 1.61e+01 -2.94e+01 -1.92e+00 -1.47e+01 -2.41e-03 1.46e+00 -1.46e-02 -1.08e+01 4.35e+01 -6.92e+00 4.95e+00 -4.11e+00 -3.63e+00 8.98e-03 8.33e-03 4.84e-03 -5.25e+00 -1.59e+01 7.47e+00 2.31e+00 -2.48e-01 1.22e+01 -2.90e+00 -1.49e+00 4.96e+00 5.21e+00 1.82e+02 1.15e+01 1.54e+02 -3.40e+02 -1.22e+02 2.75e+00 -2.87e+01 2.39e+00 9.44e-01 3.18e+00 -1.17e+01 -5.46e-03 4.24e+01 -1.10e-03 -9.23e-01 5.13e+00 -4.69e+00 1.13e+00 -1.70e+01 -5.00e+01 5.64e+01 -2.94e+01 3.42e-01 -3.10e+01 2.89e+01 -5.46e+01 6.75e+02 8.54e+01 -3.35e+02 -3.17e+01 2.96e+01 7.07e+00 7.46e+01 2.01e-02 -3.96e-01 3.15e+01 1.00e+01 -1.60e+00 -5.63e-01 2.82e+00 -2.96e+01 1.08e+11 -1.01e-03 -1.08e+11 1.08e+11 -3.13e+08 -4.95e-01 3.13e+08 -3.13e+08 1.47e+00 -2.78e+00 1.12e+00 -3.70e+01 1.09e-01 3.07e-01 2.06e+01 9.24e-01 -6.05e-01 -1.92e+00 5.88e-01]
R-squared score (training): 0.668
R-squared score (test): 0.520
Ridge regression
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                     random_state=0)
# alpha is the ridge regularization strength
linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'
      .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
      .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
      .format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
      .format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'
      .format(np.sum(linridge.coef_ != 0)))
Crime dataset
ridge regression linear model intercept: -3352.4230358464793
ridge regression linear model coeff:
[ 1.95e-03 2.19e+01 9.56e+00 -3.59e+01 6.36e+00 -1.97e+01 -2.81e-03 1.66e+00 -6.61e-03 -6.95e+00 1.72e+01 -5.63e+00 8.84e+00 6.79e-01 -7.34e+00 6.70e-03 9.79e-04 5.01e-03 -4.90e+00 -1.79e+01 9.18e+00 -1.24e+00 1.22e+00 1.03e+01 -3.78e+00 -3.73e+00 4.75e+00 8.43e+00 3.09e+01 1.19e+01 -2.05e+00 -3.82e+01 1.85e+01 1.53e+00 -2.20e+01 2.46e+00 3.29e-01 4.02e+00 -1.13e+01 -4.70e-03 4.27e+01 -1.23e-03 1.41e+00 9.35e-01 -3.00e+00 1.12e+00 -1.82e+01 -1.55e+01 2.42e+01 -1.32e+01 -4.20e-01 -3.60e+01 1.30e+01 -2.81e+01 4.39e+01 3.87e+01 -6.46e+01 -1.64e+01 2.90e+01 4.15e+00 5.34e+01 1.99e-02 -5.47e-01 1.24e+01 1.04e+01 -1.57e+00 3.16e+00 8.78e+00 -2.95e+01 -2.34e-04 3.14e-04 -4.13e-04 -1.80e-04 -5.74e-01 -5.18e-01 -4.21e-01 1.53e-01 1.33e+00 3.85e+00 3.03e+00 -3.78e+01 1.38e-01 3.08e-01 1.57e+01 3.31e-01 3.36e+00 1.61e-01 -2.68e+00]
R-squared score (training): 0.671
R-squared score (test): 0.494
Number of non-zero features: 88
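For reference, the alpha above weights the L2 penalty in the ridge objective that scikit-learn minimizes (a standard formulation):

$$\min_{w,\,b}\; \lVert y - Xw - b \rVert_2^2 + \alpha \lVert w \rVert_2^2$$

Larger alpha shrinks the coefficients toward zero but, unlike the lasso further below, rarely makes them exactly zero.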
Ridge regression with normalized (MinMax-scaled) features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                     random_state=0)

# Scale the features to the [0, 1] range (fit the scaler on the training data only)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('Crime dataset')
print('ridge regression linear model intercept: {}'
      .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
      .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
      .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
      .format(linridge.score(X_test_scaled, y_test)))
print('Number of non-zero features: {}'
      .format(np.sum(linridge.coef_ != 0)))
Crime dataset
ridge regression linear model intercept: 933.3906385044113
ridge regression linear model coeff:
[ 88.69 16.49 -50.3 -82.91 -65.9 -2.28 87.74 150.95 18.88 -31.06 -43.14 -189.44 -4.53 107.98 -76.53 2.86 34.95 90.14 52.46 -62.11 115.02 2.67 6.94 -5.67 -101.55 -36.91 -8.71 29.12 171.26 99.37 75.07 123.64 95.24 -330.61 -442.3 -284.5 -258.37 17.66 -101.71 110.65 523.14 24.82 4.87 -30.47 -3.52 50.58 10.85 18.28 44.11 58.34 67.09 -57.94 116.14 53.81 49.02 -7.62 55.14 -52.09 123.39 77.13 45.5 184.91 -91.36 1.08 234.09 10.39 94.72 167.92 -25.14 -1.18 14.6 36.77 53.2 -78.86 -5.9 26.05 115.15 68.74 68.29 16.53 -97.91 205.2 75.97 61.38 -79.83 67.27 95.67 -11.88]
R-squared score (training): 0.615
R-squared score (test): 0.599
Number of non-zero features: 88
Effect of the regularization parameter alpha on ridge regression
print('Ridge regression: effect of alpha regularization parameter\n')
# Vary alpha (the regularization strength)
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha=this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, \
r-squared training: {:.2f}, r-squared test: {:.2f}\n'
          .format(this_alpha, num_coeff_bigger, r2_train, r2_test))
Ridge regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 1.0: 87, r-squared training: 0.67, r-squared test: 0.50

Alpha = 1.00
num abs(coeff) > 1.0: 87, r-squared training: 0.66, r-squared test: 0.56

Alpha = 10.00
num abs(coeff) > 1.0: 87, r-squared training: 0.63, r-squared test: 0.59

Alpha = 20.00
num abs(coeff) > 1.0: 88, r-squared training: 0.61, r-squared test: 0.60

Alpha = 50.00
num abs(coeff) > 1.0: 86, r-squared training: 0.58, r-squared test: 0.58

Alpha = 100.00
num abs(coeff) > 1.0: 87, r-squared training: 0.55, r-squared test: 0.55

Alpha = 1000.00
num abs(coeff) > 1.0: 84, r-squared training: 0.31, r-squared test: 0.30
Lasso regression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                     random_state=0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print('Crime dataset')
print('lasso regression linear model intercept: {}'
      .format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'
      .format(linlasso.coef_))
print('Non-zero features: {}'
      .format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'
      .format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'
      .format(linlasso.score(X_test_scaled, y_test)))
print('Features with non-zero weight (sorted by absolute magnitude):')

for e in sorted(list(zip(list(X_crime), linlasso.coef_)),
                key=lambda e: -abs(e[1])):
    if e[1] != 0:
        print('\t{}, {:.3f}'.format(e[0], e[1]))
Crime dataset
lasso regression linear model intercept: 1186.6120619985809
lasso regression linear model coeff:
[ 0. 0. -0. -168.18 -0. -0. 0. 119.69 0. -0. 0. -169.68 -0. 0. -0. 0. 0. 0. -0. -0. 0. -0. 0. 0. -57.53 -0. -0. 0. 259.33 -0. 0. 0. 0. -0. -1188.74 -0. -0. -0. -231.42 0. 1488.37 0. -0. -0. -0. 0. 0. 0. 0. 0. -0. 0. 20.14 0. 0. 0. 0. 0. 339.04 0. 0. 459.54 -0. 0. 122.69 -0. 91.41 0. -0. 0. 0. 73.14 0. -0. 0. 0. 86.36 0. 0. 0. -104.57 264.93 0. 23.45 -49.39 0. 5.2 0. ]
Non-zero features: 20
R-squared score (training): 0.631
R-squared score (test): 0.624

Features with non-zero weight (sorted by absolute magnitude):
	PctKidsBornNeverMar, 1488.365
	PctKids2Par, -1188.740
	HousVacant, 459.538
	PctPersDenseHous, 339.045
	NumInShelters, 264.932
	MalePctDivorce, 259.329
	PctWorkMom, -231.423
	pctWInvInc, -169.676
	agePct12t29, -168.183
	PctVacantBoarded, 122.692
	pctUrban, 119.694
	MedOwnCostPctIncNoMtg, -104.571
	MedYrHousBuilt, 91.412
	RentQrange, 86.356
	OwnOccHiQuart, 73.144
	PctEmplManu, -57.530
	PctBornSameState, -49.394
	PctForeignBorn, 23.449
	PctLargHouseFam, 20.144
	PctSameCity85, 5.198
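The L1 penalty is what drives many coefficients to exactly zero. scikit-learn's Lasso minimizes (a standard formulation):

$$\min_{w,\,b}\; \frac{1}{2\, n_{\text{samples}}} \lVert y - Xw - b \rVert_2^2 + \alpha \lVert w \rVert_1$$

which is why raising alpha in the sweep below removes more and more features from the model.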
Effect of alpha (the regularization parameter) on Lasso regression
print('Lasso regression: effect of alpha regularization\n\
parameter on number of features kept in final model\n')

for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha, max_iter=10000).fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)

    print('Alpha = {:.2f}\nFeatures kept: {}, r-squared training: {:.2f}, \
r-squared test: {:.2f}\n'
          .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))
Lasso regression: effect of alpha regularization
parameter on number of features kept in final model

Alpha = 0.50
Features kept: 35, r-squared training: 0.65, r-squared test: 0.58

Alpha = 1.00
Features kept: 25, r-squared training: 0.64, r-squared test: 0.60

Alpha = 2.00
Features kept: 20, r-squared training: 0.63, r-squared test: 0.62

Alpha = 3.00
Features kept: 17, r-squared training: 0.62, r-squared test: 0.63

Alpha = 5.00
Features kept: 12, r-squared training: 0.60, r-squared test: 0.61

Alpha = 10.00
Features kept: 6, r-squared training: 0.57, r-squared test: 0.58

Alpha = 20.00
Features kept: 2, r-squared training: 0.51, r-squared test: 0.50

Alpha = 50.00
Features kept: 1, r-squared training: 0.31, r-squared test: 0.30
Polynomial regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures


X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,
                                                     random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

print('linear model coeff (w): {}'
      .format(linreg.coef_))
print('linear model intercept (b): {:.3f}'
      .format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
      .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
      .format(linreg.score(X_test, y_test)))

print('Now expand the features with degree-2 polynomial terms (squares and pairwise products)')
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit_transform(X_F1)

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                     random_state=0)
linreg = LinearRegression().fit(X_train, y_train)

print('(poly deg 2) linear model coeff (w):\n{}'
      .format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {:.3f}'
      .format(linreg.intercept_))
print('(poly deg 2) R-squared score (training): {:.3f}'
      .format(linreg.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}\n'
      .format(linreg.score(X_test, y_test)))

print('Polynomial features make overfitting easy; adding regularization\n'
      '(as in ridge regression) to the polynomial model reduces it')

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
                                                     random_state=0)
linreg = Ridge().fit(X_train, y_train)

print('(poly deg 2 + ridge) linear model coeff (w):\n{}'
      .format(linreg.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'
      .format(linreg.intercept_))
print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'
      .format(linreg.score(X_train, y_train)))
print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'
      .format(linreg.score(X_test, y_test)))
linear model coeff (w): [ 4.42 6. 0.53 10.24 6.55 -2.02 -0.32]
linear model intercept (b): 1.543
R-squared score (training): 0.722
R-squared score (test): 0.722
Now expand the features with degree-2 polynomial terms (squares and pairwise products)
(poly deg 2) linear model coeff (w):
[ 3.41e-12 1.66e+01 2.67e+01 -2.21e+01 1.24e+01 6.93e+00 1.05e+00 3.71e+00 -1.34e+01 -5.73e+00 1.62e+00 3.66e+00 5.05e+00 -1.46e+00 1.95e+00 -1.51e+01 4.87e+00 -2.97e+00 -7.78e+00 5.15e+00 -4.65e+00 1.84e+01 -2.22e+00 2.17e+00 -1.28e+00 1.88e+00 1.53e-01 5.62e-01 -8.92e-01 -2.18e+00 1.38e+00 -4.90e+00 -2.24e+00 1.38e+00 -5.52e-01 -1.09e+00]
(poly deg 2) linear model intercept (b): -3.206
(poly deg 2) R-squared score (training): 0.969
(poly deg 2) R-squared score (test): 0.805

Polynomial features make overfitting easy; adding regularization
(as in ridge regression) to the polynomial model reduces it
(poly deg 2 + ridge) linear model coeff (w):
[ 0. 2.23 4.73 -3.15 3.86 1.61 -0.77 -0.15 -1.75 1.6 1.37 2.52 2.72 0.49 -1.94 -1.63 1.51 0.89 0.26 2.05 -1.93 3.62 -0.72 0.63 -3.16 1.29 3.55 1.73 0.94 -0.51 1.7 -1.98 1.81 -0.22 2.88 -0.89]
(poly deg 2 + ridge) linear model intercept (b): 5.418
(poly deg 2 + ridge) R-squared score (training): 0.826
(poly deg 2 + ridge) R-squared score (test): 0.825
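A minimal, self-contained sketch (independent of the data above) showing exactly which columns PolynomialFeatures(degree=2) produces for two input features:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X_small = np.array([[2.0, 3.0]])
poly = PolynomialFeatures(degree=2)
# Columns are: 1, x0, x1, x0^2, x0*x1, x1^2  ->  [[1. 2. 3. 4. 6. 9.]]
print(poly.fit_transform(X_small))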
Linear models for classification
Logistic regression
Classifying fruit type using height and width as features
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
     plot_class_regions_for_classifier_subplot)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
# X_fruits_2d / y_fruits_2d: height and width columns of the course's fruit dataset
y_fruits_apple = y_fruits_2d == 1   # make into a binary problem: apples vs everything else
# as_matrix() converts the DataFrame to a NumPy array (removed in newer pandas; use .to_numpy())
X_train, X_test, y_train, y_test = (
    train_test_split(X_fruits_2d.as_matrix(),
                     y_fruits_apple.as_matrix(),
                     random_state=0))

clf = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                          None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others',
                                          subaxes)

h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'
      .format(h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))

h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'
      .format(h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))
subaxes.set_xlabel('height')
subaxes.set_ylabel('width')

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
A fruit with height 6 and width 8 is predicted to be: an apple
A fruit with height 10 and width 7 is predicted to be: not an apple
Accuracy of Logistic regression classifier on training set: 0.77
Accuracy of Logistic regression classifier on test set: 0.73
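The class labels above come from thresholding a probability at 0.5. A minimal sketch, assuming clf from the block above is still in scope, that shows the underlying probabilities:

# Each predict_proba row is [P(not apple), P(apple)] for one fruit
for h, w in [(6, 8), (10, 7)]:
    proba = clf.predict_proba([[h, w]])[0]
    print('height {}, width {}: P(apple) = {:.3f}'.format(h, w, proba[1]))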
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
     plot_class_regions_for_classifier_subplot)


X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
                                                     random_state=0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
clf = LogisticRegression().fit(X_train, y_train)
title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                          None, None, title, subaxes)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
Accuracy of Logistic regression classifier on training set: 0.80
Accuracy of Logistic regression classifier on test set: 0.80
Effect of the regularization parameter C on logistic regression
X_train, X_test, y_train, y_test = (
    train_test_split(X_fruits_2d.as_matrix(),
                     y_fruits_apple.as_matrix(),
                     random_state=0))

fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))

for this_C, subplot in zip([0.1, 1, 100], subaxes):
    clf = LogisticRegression(C=this_C).fit(X_train, y_train)
    title = 'Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)

    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                              X_test, y_test, title,
                                              subplot)
plt.tight_layout()
Application to a real dataset
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

clf = LogisticRegression().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
Breast cancer dataset
Accuracy of Logistic regression classifier on training set: 0.96
Accuracy of Logistic regression classifier on test set: 0.96
SVM
Linear SVM
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot


X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
this_C = 1.0
# Linear kernel
clf = SVC(kernel='linear', C=this_C).fit(X_train, y_train)
title = 'Linear SVC, C = {:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)
Linear Support Vector Machine: C parameter
from sklearn.svm import LinearSVC
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)
fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))

for this_C, subplot in zip([0.00001, 100], subaxes):
    clf = LinearSVC(C=this_C).fit(X_train, y_train)
    title = 'Linear SVC, C = {:.5f}'.format(this_C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                              None, None, title, subplot)
plt.tight_layout()
Linear SVM on a real dataset
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

clf = LinearSVC().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Linear SVC classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
Breast cancer dataset
Accuracy of Linear SVC classifier on training set: 0.74
Accuracy of Linear SVC classifier on test set: 0.74
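The modest accuracy here is at least partly due to the unscaled features (LinearSVC can struggle to converge on them). A minimal sketch, assuming X_train, X_test, y_train, and y_test from the block above are still in scope, that repeats the fit on MinMax-scaled features:

from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training data only
X_test_scaled = scaler.transform(X_test)

clf = LinearSVC().fit(X_train_scaled, y_train)
print('Linear SVC (MinMax-scaled) test accuracy: {:.2f}'
      .format(clf.score(X_test_scaled, y_test)))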
Multi-class classification with linear models
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state=0)

clf = LinearSVC(C=5, random_state=67).fit(X_train, y_train)
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)
Coefficients:
 [[-0.26  0.71]
 [-1.63  1.16]
 [ 0.03  0.29]
 [ 1.24 -1.64]]
Intercepts:
 [-3.29  1.2  -2.72  1.16]
Multi-class decision boundaries on the fruit dataset
plt.figure(figsize=(6, 6))
colors = ['r', 'g', 'b', 'y']
cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#FFFF00'])

plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],
            c=y_fruits_2d, cmap=cmap_fruits, edgecolor='black', alpha=.7)

x_0_range = np.linspace(-10, 15)

for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):
    # Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b,
    # and the decision boundary is defined as being all points with y = 0, to plot x_1 as a
    # function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:
    plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)

plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show()
Kernelized SVM
Classification
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

# RBF kernel (the default)
plot_class_regions_for_classifier(SVC().fit(X_train, y_train),
                                  X_train, y_train, None, None,
                                  'Support Vector Classifier: RBF kernel')

# Polynomial kernel, degree = 3
plot_class_regions_for_classifier(SVC(kernel='poly', degree=3)
                                  .fit(X_train, y_train), X_train,
                                  y_train, None, None,
                                  'Support Vector Classifier: Polynomial kernel, degree = 3')
Effect of the gamma parameter on the RBF-kernel SVM
from adspy_shared_utilities import plot_class_regions_for_classifier

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))

for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
    clf = SVC(kernel='rbf', gamma=this_gamma).fit(X_train, y_train)
    title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(this_gamma)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                              None, None, title, subplot)
plt.tight_layout()
Effect of gamma and C on the RBF-kernel SVM
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)

for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):

    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
        clf = SVC(kernel='rbf', gamma=this_gamma,
                  C=this_C).fit(X_train, y_train)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                  X_test, y_test, title,
                                                  subplot)
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
SVM on unnormalized data
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                     random_state=0)

clf = SVC(C=10).fit(X_train, y_train)
print('Breast cancer dataset (unnormalized features)')
print('Accuracy of RBF-kernel SVC on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of RBF-kernel SVC on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
Breast cancer dataset (unnormalized features)
Accuracy of RBF-kernel SVC on training set: 1.00
Accuracy of RBF-kernel SVC on test set: 0.63
SVM on normalized data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(C=10).fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'
      .format(clf.score(X_train_scaled, y_train)))
print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'
      .format(clf.score(X_test_scaled, y_test)))
Breast cancer dataset (normalized with MinMax scaling)
RBF-kernel SVC (with MinMax scaling) training set accuracy: 0.98
RBF-kernel SVC (with MinMax scaling) test set accuracy: 0.96
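To keep the scaling step inside cross-validation (so the scaler is never fit on a test fold), the scaler and the SVC can be chained in a Pipeline. A minimal sketch, assuming X_cancer and y_cancer from the data-generation section are in scope:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# The scaler is re-fit on each training fold, which avoids information leakage
pipe = make_pipeline(MinMaxScaler(), SVC(C=10))
scores = cross_val_score(pipe, X_cancer, y_cancer, cv=5)
print('Cross-validated accuracy: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))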
Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)
X = X_fruits_2d.as_matrix()
y = y_fruits_2d.as_matrix()
# Run cross-validation (3 folds by default in this sklearn version)
cv_scores = cross_val_score(clf, X, y)

print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
      .format(np.mean(cv_scores)))
Cross-validation scores (3-fold): [ 0.77  0.74  0.83]
Mean cross-validation score (3-fold): 0.781
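The number of folds can also be set explicitly, and for classification a stratified split is usually preferable. A minimal sketch, assuming clf, X, and y from the block above are in scope:

from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5 stratified folds: each fold keeps roughly the same class proportions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=skf)
print('5-fold scores:', scores)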
Validation curve
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                             param_name='gamma',
                                             param_range=param_range, cv=3)
print(train_scores)
[[ 0.49  0.42  0.41]
 [ 0.84  0.72  0.76]
 [ 0.92  0.9   0.93]
 [ 1.    1.    0.98]]
print(test_scores)
[[ 0.45  0.32  0.33]
 [ 0.82  0.68  0.61]
 [ 0.41  0.84  0.67]
 [ 0.36  0.21  0.39]]
# This code based on scikit-learn validation_plot example
# See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
plt.xlabel('$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
             color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color='darkorange', lw=lw)

plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
             color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color='navy', lw=lw)

plt.legend(loc='best')
plt.show()
Decision trees
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split


iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=3)
clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))
Limiting tree depth to avoid overfitting
# max_depth limits the maximum depth of the decision tree
clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
      .format(clf2.score(X_test, y_test)))
Accuracy of Decision Tree classifier on training set: 0.98
Accuracy of Decision Tree classifier on test set: 0.97
Visualizing the decision tree
plot_decision_tree(clf, iris.feature_names, iris.target_names)
# clf2 is the tree limited to max_depth = 3
plot_decision_tree(clf2, iris.feature_names, iris.target_names)
Feature importances
from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10, 4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
plt.show()

print('Feature importances: {}'.format(clf.feature_importances_))
Feature importances: [ 0. 0.02 0.56 0.42]
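A minimal sketch, assuming clf and iris from the blocks above are in scope, that pairs each importance with its feature name and sorts them:

# Sort (feature, importance) pairs by decreasing importance
pairs = sorted(zip(iris.feature_names, clf.feature_importances_),
               key=lambda p: -p[1])
for name, imp in pairs:
    print('{:20s} {:.3f}'.format(name, imp))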
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
tree_max_depth = 4

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train

    clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
    title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
    plot_class_regions_for_classifier_subplot(clf, X, y, None,
                                              None, title, axis,
                                              iris.target_names)

    axis.set_xlabel(iris.feature_names[pair[0]])
    axis.set_ylabel(iris.feature_names[pair[1]])

plt.tight_layout()
plt.show()
Decision trees on a real dataset
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8,
                             random_state=0).fit(X_train, y_train)

plot_decision_tree(clf, cancer.feature_names, cancer.target_names)
print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
      .format(clf.score(X_test, y_test)))

plt.figure(figsize=(10, 6), dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()

plt.show()
Breast cancer dataset: decision tree
Accuracy of DT classifier on training set: 0.96
Accuracy of DT classifier on test set: 0.94
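The max_depth and min_samples_leaf values above were picked by hand. A minimal sketch, assuming X_train and y_train from the block above are in scope, of how they could instead be tuned with a grid search (the parameter grid here is only an illustration):

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hypothetical grid; the values are chosen only for illustration
param_grid = {'max_depth': [2, 3, 4, 5, 6],
              'min_samples_leaf': [1, 4, 8, 16]}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0),
                    param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Best cross-validation accuracy: {:.2f}'.format(grid.best_score_))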