【ML-9】支持向量机--实验scikit-learn SVM
目录
- scikit-learn SVM API说明
- 鸢尾花SVM特征分类
- 鸢尾花数据不同分类器准确率比较
- 不同SVM核函数效果比较
- 异常值检测(OneClassSVM)
- 分类问题总结
一、scikit-learn SVM API说明
1.1 scikit-learn SVM算法库概述--分类算法
svm.SVC API说明:也可见另一篇博文:https://www.cnblogs.com/yifanrensheng/p/11863324.html
参数说明(常用参数的组合用法见列表后的示例):
- C: 误差项的惩罚系数,默认为1.0;一般为大于0的数字。C越大,训练过程中对总误差的关注度越高,在训练集上的表现会越好,但可能引发过拟合(overfitting)问题
- kernel:指定SVM核函数的类型,可选值:linear、poly、rbf、sigmoid、precomputed(precomputed基本不用,它要求传入预先计算好的核矩阵,即输入形状为样本数×样本数);默认是rbf
- degree:当使用多项式核(poly)的时候,给定多项式的最高次数,默认为3
- gamma:当核函数为poly、rbf、sigmoid的时候,核函数的系数值;取默认值auto时,实际系数为1/n_features
- coef0: 当核函数为poly或者sigmoid的时候,给定的独立项系数,默认为0
- probability:是否启用概率估计,默认不启用;不太建议启用,因为会明显增加训练时间
- shrinking:是否开启收缩启发式计算,默认为True
- tol: 模型构建的收敛参数,当模型的误差变化率小于该值的时候,结束模型构建过程,默认值:1e-3
- cache_size:在模型构建过程中,核缓存的最大内存大小,单位MB,默认为200
- class_weight:给定各个类别的权重,默认为空(None,即各类别权重相同)
- max_iter:最大迭代次数,默认-1表示不限制
- decision_function_shape: 多分类决策函数的形式,可选值:ovo和ovr,默认为None;推荐使用ovr(scikit-learn 0.17及以上版本才有该参数)
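下面给出一个把上述常用参数显式写出来构造SVC的最小示意(参数取值仅作演示,并非推荐配置):

from sklearn.svm import SVC

# C:惩罚系数;kernel:核函数;gamma:核系数(取0.1仅作演示)
clf = SVC(C=1.0, kernel='rbf', gamma=0.1,
          probability=False,   # 不启用概率估计,训练更快
          shrinking=True, tol=1e-3,
          cache_size=200, class_weight=None,
          max_iter=-1, decision_function_shape='ovr')

构造出的clf再通过fit/predict使用,与下文第二节的实验流程一致。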
1.2 scikit-learn SVM算法库概述--回归算法
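回归场景对应的类主要是svm.SVR、svm.NuSVR和svm.LinearSVR,参数含义与SVC基本一致,主要差别是增加了epsilon参数(epsilon-不敏感损失带的宽度,落在带内的误差不计惩罚)。下面是一个在随机构造数据上使用SVR的最小示意(数据为演示用随机生成,非原文数据):

import numpy as np
from sklearn.svm import SVR

# 演示数据:y = sin(x) 加少量噪声
rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(40, 1), axis=0)
y = np.sin(X).ravel() + 0.1 * rng.randn(40)

# epsilon控制不敏感带宽度;C、kernel、gamma与分类时含义相同
reg = SVR(kernel='rbf', C=1.0, gamma=0.5, epsilon=0.1)
reg.fit(X, y)
print(reg.predict(X[:5]))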
1.3 scikit-learn SVM--OneClassSVM
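OneClassSVM属于无监督的异常检测算法:只用"正常"样本训练,预测时返回1(属于该类)或-1(异常);核心参数nu近似给出训练集中异常样本比例的上界,kernel、gamma的含义与SVC相同。完整实验见下文第五节。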
二、鸢尾花SVM特征分类
# Author:yifan
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

from sklearn import svm  # svm导入
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ChangedBehaviorWarning

## 设置属性防止中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

warnings.filterwarnings('ignore', category=ChangedBehaviorWarning)

## 读取数据
# 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
path = './datas/iris.data'  # 数据文件路径
data = pd.read_csv(path, header=None)
x, y = data[list(range(4))], data[4]
y = pd.Categorical(y).codes  # 把文本类别编码为0 1 2;可以通过pd.Categorical(y).categories获取编码对应的原始值
x = x[[0, 1]]  # 只取第一列和第二列(花萼长度、花萼宽度)

## 数据分割
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, train_size=0.8)

## 数据SVM分类器构建
# gamma值越大,训练集的拟合越好,但容易过拟合,导致测试集效果变差;
# gamma值越小,模型的泛化能力越好,训练集和测试集的拟合相近,但可能欠拟合,
# 训练集准确率变低,测试集准确率也随之变低。
clf = svm.SVC(C=1, kernel='rbf', gamma=0.1)

## 模型训练
# 等价的完整参数:SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
#     max_iter=-1, probability=False, random_state=None, shrinking=True,
#     tol=0.001, verbose=False)
clf.fit(x_train, y_train)

## 计算模型的准确率/精度
print(clf.score(x_train, y_train))
print('训练集准确率:', accuracy_score(y_train, clf.predict(x_train)))
print(clf.score(x_test, y_test))
print('测试集准确率:', accuracy_score(y_test, clf.predict(x_test)))

## 画图
N = 500
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # 生成网格采样点
grid_show = np.dstack((x1.flat, x2.flat))[0]  # 网格测试点

grid_hat = clf.predict(grid_show)  # 预测分类值
grid_hat = grid_hat.reshape(x1.shape)  # 使之与输入的形状相同

cm_light = mpl.colors.ListedColormap(['#00FFCC', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
plt.figure(facecolor='w')
## 区域图
plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
## 所有样本点
plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)  # 样本
## 测试数据集
plt.scatter(x_test[0], x_test[1], s=120, facecolors='none', zorder=10)  # 圈中测试集样本
## 坐标轴标签与标题
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title(u'鸢尾花SVM特征分类', fontsize=16)
plt.grid(b=True, ls=':')
plt.tight_layout(pad=1.5)
plt.show()
结果:
0.85
训练集准确率: 0.85
0.7333333333333333
测试集准确率: 0.7333333333333333
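上面注释中关于gamma的结论,可以沿用本节已有的x_train、x_test等变量,用一个小循环直观验证(gamma取值仅作演示):

for gamma in [0.01, 0.1, 1, 10, 100]:  # 演示用取值
    m = svm.SVC(C=1, kernel='rbf', gamma=gamma).fit(x_train, y_train)
    print('gamma=%-6s 训练集准确率: %.3f  测试集准确率: %.3f'
          % (gamma, m.score(x_train, y_train), m.score(x_test, y_test)))

一般可以观察到:gamma很大时训练集准确率接近1而测试集准确率下降,即过拟合;gamma很小时两者都偏低,即欠拟合。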
三、鸢尾花数据不同分类器准确率比较
# Author:yifan
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier

## 设置属性防止中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

## 读取数据
# 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
path = './datas/iris.data'  # 数据文件路径
data = pd.read_csv(path, header=None)
x, y = data[list(range(4))], data[4]
y = pd.Categorical(y).codes
x = x[[0, 1]]

## 数据分割
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=28, train_size=0.6)

## 数据SVM分类器构建
svm = SVC(C=1, kernel='linear')
## Linear分类器构建
lr = LogisticRegression()
# Ridge分类器是为了解决特征数大于样本数导致分类效果较差的情况而提出的;
# SVM有一个重要瓶颈:当特征数大于样本数的时候,效果会变差
rc = RidgeClassifier()
knn = KNeighborsClassifier()

## 模型训练
svm.fit(x_train, y_train)
lr.fit(x_train, y_train)
rc.fit(x_train, y_train)
knn.fit(x_train, y_train)

## 效果评估
svm_score1 = accuracy_score(y_train, svm.predict(x_train))
svm_score2 = accuracy_score(y_test, svm.predict(x_test))

lr_score1 = accuracy_score(y_train, lr.predict(x_train))
lr_score2 = accuracy_score(y_test, lr.predict(x_test))

rc_score1 = accuracy_score(y_train, rc.predict(x_train))
rc_score2 = accuracy_score(y_test, rc.predict(x_test))

knn_score1 = accuracy_score(y_train, knn.predict(x_train))
knn_score2 = accuracy_score(y_test, knn.predict(x_test))

## 画图:准确率比较
x_tmp = [0, 1, 2, 3]
y_score1 = [svm_score1, lr_score1, rc_score1, knn_score1]
y_score2 = [svm_score2, lr_score2, rc_score2, knn_score2]

plt.figure(facecolor='w')
plt.plot(x_tmp, y_score1, 'r-', lw=2, label=u'训练集准确率')
plt.plot(x_tmp, y_score2, 'g-', lw=2, label=u'测试集准确率')
plt.xlim(0, 3)
plt.ylim(np.min((np.min(y_score1), np.min(y_score2))) * 0.9,
         np.max((np.max(y_score1), np.max(y_score2))) * 1.1)
plt.legend(loc='lower right')
plt.title(u'鸢尾花数据不同分类器准确率比较', fontsize=16)
plt.xticks(x_tmp, [u'SVM', u'Logistic', u'Ridge', u'KNN'], rotation=0)
plt.grid(b=True)
plt.show()

### 画图:决策区域比较
N = 500
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # 生成网格采样点
grid_show = np.dstack((x1.flat, x2.flat))[0]  # 网格测试点

## 获取各个不同算法的预测值,并使之与输入网格的形状相同
svm_grid_hat = svm.predict(grid_show).reshape(x1.shape)
lr_grid_hat = lr.predict(grid_show).reshape(x1.shape)
rc_grid_hat = rc.predict(grid_show).reshape(x1.shape)
knn_grid_hat = knn.predict(grid_show).reshape(x1.shape)

## 画图
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
plt.figure(facecolor='w', figsize=(14, 7))

## 四个子图结构相同,只是预测值和标题不同
for idx, (grid_hat, title) in enumerate(zip(
        [svm_grid_hat, lr_grid_hat, rc_grid_hat, knn_grid_hat],
        [u'鸢尾花SVM特征分类', u'鸢尾花Logistic特征分类',
         u'鸢尾花Ridge特征分类', u'鸢尾花KNN特征分类'])):
    plt.subplot(2, 2, idx + 1)
    ## 区域图
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    ## 所有样本点
    plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)
    ## 测试数据集(圈中测试集样本)
    plt.scatter(x_test[0], x_test[1], s=120, facecolors='none', zorder=10)
    ## 坐标轴标签与标题
    plt.xlabel(iris_feature[0], fontsize=13)
    plt.ylabel(iris_feature[1], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(title, fontsize=16)
    plt.grid(b=True, ls=':')
    plt.tight_layout(pad=1.5)
plt.show()
结果:
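上面的比较只基于一次train_test_split划分,结果会受random_state取值影响;更稳妥的做法是用交叉验证取平均,下面是一个最小示意(沿用本节已构建的四个模型和完整的x、y):

from sklearn.model_selection import cross_val_score

# cross_val_score内部会克隆模型并重新训练,不影响上面已fit的结果
for name, model in [('SVM', svm), ('Logistic', lr), ('Ridge', rc), ('KNN', knn)]:
    scores = cross_val_score(model, x, y, cv=5)  # 5折交叉验证
    print('%-10s 平均准确率: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))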
四、不同SVM核函数效果比较
# Author:yifan
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 设置属性防止中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

## 读取数据
# 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
path = './datas/iris.data'  # 数据文件路径
data = pd.read_csv(path, header=None)
x, y = data[list(range(4))], data[4]
y = pd.Categorical(y).codes
x = x[[0, 1]]

## 数据分割
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=28, train_size=0.6)

## 四种核函数的SVM分类器构建
svm1 = SVC(C=1, kernel='linear')
svm2 = SVC(C=1, kernel='rbf')
svm3 = SVC(C=1, kernel='poly')
svm4 = SVC(C=1, kernel='sigmoid')

## 模型训练并计时
t0 = time.time()
svm1.fit(x_train, y_train)
t1 = time.time()
svm2.fit(x_train, y_train)
t2 = time.time()
svm3.fit(x_train, y_train)
t3 = time.time()
svm4.fit(x_train, y_train)
t4 = time.time()

### 效果评估
svm1_score1 = accuracy_score(y_train, svm1.predict(x_train))
svm1_score2 = accuracy_score(y_test, svm1.predict(x_test))

svm2_score1 = accuracy_score(y_train, svm2.predict(x_train))
svm2_score2 = accuracy_score(y_test, svm2.predict(x_test))

svm3_score1 = accuracy_score(y_train, svm3.predict(x_train))
svm3_score2 = accuracy_score(y_test, svm3.predict(x_test))

svm4_score1 = accuracy_score(y_train, svm4.predict(x_train))
svm4_score2 = accuracy_score(y_test, svm4.predict(x_test))

## 画图:准确率与训练耗时
x_tmp = [0, 1, 2, 3]
t_score = [t1 - t0, t2 - t1, t3 - t2, t4 - t3]
y_score1 = [svm1_score1, svm2_score1, svm3_score1, svm4_score1]
y_score2 = [svm1_score2, svm2_score2, svm3_score2, svm4_score2]

plt.figure(facecolor='w', figsize=(12, 6))

plt.subplot(121)
plt.plot(x_tmp, y_score1, 'r-', lw=2, label=u'训练集准确率')
plt.plot(x_tmp, y_score2, 'g-', lw=2, label=u'测试集准确率')
plt.xlim(-0.3, 3.3)
plt.ylim(np.min((np.min(y_score1), np.min(y_score2))) * 0.9,
         np.max((np.max(y_score1), np.max(y_score2))) * 1.1)
plt.legend(loc='lower left')
plt.title(u'模型预测准确率', fontsize=13)
plt.xticks(x_tmp, [u'linear-SVM', u'rbf-SVM', u'poly-SVM', u'sigmoid-SVM'], rotation=0)
plt.grid(b=True)

plt.subplot(122)
plt.plot(x_tmp, t_score, 'b-', lw=2, label=u'模型训练时间')
plt.title(u'模型训练耗时', fontsize=13)
plt.xticks(x_tmp, [u'linear-SVM', u'rbf-SVM', u'poly-SVM', u'sigmoid-SVM'], rotation=0)
plt.xlim(-0.3, 3.3)
plt.grid(b=True)
plt.suptitle(u'鸢尾花数据SVM分类器不同内核函数模型比较', fontsize=16)

plt.show()

### 预测结果画图比较
N = 500
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # 生成网格采样点
grid_show = np.dstack((x1.flat, x2.flat))[0]  # 网格测试点

## 获取各个核函数模型的预测值,并使之与输入网格的形状相同
svm1_grid_hat = svm1.predict(grid_show).reshape(x1.shape)
svm2_grid_hat = svm2.predict(grid_show).reshape(x1.shape)
svm3_grid_hat = svm3.predict(grid_show).reshape(x1.shape)
svm4_grid_hat = svm4.predict(grid_show).reshape(x1.shape)

## 画图
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
plt.figure(facecolor='w', figsize=(14, 7))

## 四个子图结构相同,只是预测值和标题不同
for idx, (grid_hat, title) in enumerate(zip(
        [svm1_grid_hat, svm2_grid_hat, svm3_grid_hat, svm4_grid_hat],
        [u'鸢尾花Linear-SVM特征分类', u'鸢尾花rbf-SVM特征分类',
         u'鸢尾花poly-SVM特征分类', u'鸢尾花sigmoid-SVM特征分类'])):
    plt.subplot(2, 2, idx + 1)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)  # 区域图
    plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)  # 所有样本点
    plt.scatter(x_test[0], x_test[1], s=120, facecolors='none', zorder=10)  # 圈中测试集样本
    plt.xlabel(iris_feature[0], fontsize=13)
    plt.ylabel(iris_feature[1], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(title, fontsize=16)
    plt.grid(b=True, ls=':')
    plt.tight_layout(pad=1.5)
plt.show()
结果:
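除了像上面那样手工逐个训练比较核函数,也可以用GridSearchCV在核函数和超参数上做网格搜索,自动选出交叉验证效果最好的组合,下面是一个示意(参数网格仅作演示):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
}
gs = GridSearchCV(SVC(), param_grid, cv=5)
gs.fit(x_train, y_train)
print('最优参数:', gs.best_params_)
print('交叉验证最优准确率: %.3f' % gs.best_score_)

注意linear核实际会忽略gamma,网格中对应的组合只是重复训练,不影响搜索结果。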
五、异常值检测(OneClassSVM)
# Author:yifan
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager
from sklearn import svm

## 设置属性防止中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# 模拟数据产生
xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# 产生训练数据
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# 产生测试数据
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# 产生一些异常点数据
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# 模型训练
clf = svm.OneClassSVM(nu=0.01, kernel="rbf", gamma=0.1)
clf.fit(X_train)

# 预测结果获取
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
# 返回1表示属于这个类别,-1表示不属于这个类别
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# 获取绘图的点信息
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# 画图
plt.figure(facecolor='w')
plt.title("异常点检测")
# 画出区域图
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 9), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
# 画出点图
s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s, edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s, edgecolors='k')

# 设置相关信息
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a.collections[0], b1, b2, c],
           ["分割超平面", "训练样本", "测试样本", "异常点"],
           loc="upper left",
           prop=matplotlib.font_manager.FontProperties(size=11))
plt.xlabel("训练集错误率: %d/200 ; 测试集错误率: %d/40 ; 异常点错误率: %d/40"
           % (n_error_train, n_error_test, n_error_outliers))
plt.show()
结果:
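nu参数值得单独说明:它近似是训练集中被判为异常的样本比例的上界(同时是支持向量比例的下界),所以上面训练集的错误数大致不会超过nu×样本数。可以用下面的小片段在训练好的clf上验证:

# 训练集中被判为异常(-1)的样本比例,理论上近似不超过nu
frac = (clf.predict(X_train) == -1).mean()
print('训练集异常比例: %.3f (nu=%.2f)' % (frac, clf.nu))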
六、分类问题总结
比较逻辑回归、KNN、决策树、随机森林、GBDT、AdaBoost、SVM等分类算法的效果,数据集使用sklearn自带的数据生成器(make_moons、make_circles、make_classification)构造的模拟数据进行测试。
# Author:yifan
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegressionCV

## 设置属性防止中文乱码
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# 构造数据
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.4, random_state=1),
            linearly_separable]

# 建模环节,用list把所有算法装起来
names = ["Nearest Neighbors", "Logistic", "Decision Tree", "Random Forest",
         "AdaBoost", "GBDT", "svm"]
classifiers = [
    KNeighborsClassifier(3),
    LogisticRegressionCV(),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(n_estimators=10, learning_rate=1.5),
    GradientBoostingClassifier(n_estimators=10, learning_rate=1.5),
    svm.SVC(C=1, kernel='rbf')
]

## 画图
figure = plt.figure(figsize=(27, 9), facecolor='w')
i = 1
h = .02  # 网格步长
for ds in datasets:
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['r', 'b', 'y'])
    # 第一列画原始数据
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # 画每个算法的图
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # hasattr判定clf模型中有没有decision_function这个方法;
        # np.c_让内部数据按列合并
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=25, horizontalalignment='right')
        i += 1

## 展示图
figure.subplots_adjust(left=.02, right=.98)
plt.show()
# plt.savefig("cs.png")
结果: