Decision Trees
What is a decision tree?
- A non-parametric learning method
- Solves classification problems
- Naturally handles multi-class classification
- Can also solve regression problems
- Highly interpretable
Complexity
- Prediction: O(log m), where m is the number of training samples
- Training: O(n * m * log m), where n is the number of features
- Pruning: reduces complexity and mitigates overfitting
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.colors import ListedColormap


def plot_decision_boundary(model, axis):
    """Plot the decision boundary of a fitted classifier over the region
    axis = [x_min, x_max, y_min, y_max]."""
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
```
Entropy vs. Gini impurity
- Computing entropy is slightly slower than computing the Gini impurity
- scikit-learn uses the Gini impurity by default
- In most cases neither criterion is clearly better than the other
Information entropy (a measure of the uncertainty of a random variable)
- The higher the entropy, the more uncertain the data
- The lower the entropy, the more certain the classification
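For k classes with proportions $p_1, \dots, p_k$, the information entropy is defined as

$$H = -\sum_{i=1}^{k} p_i \log p_i$$

The plot below specializes this to the binary case, $H(p) = -p\log p - (1-p)\log(1-p)$, which is maximal at $p = 0.5$.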
```python
def entropy(p):
    """Entropy of a binary distribution with class proportions p and 1 - p."""
    return -p * np.log(p) - (1 - p) * np.log(1 - p)


x = np.linspace(0.01, 0.99, 200)
plt.plot(x, entropy(x))
plt.show()
```
```python
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target

plt.scatter(x[y == 0, 0], x[y == 0, 1])
plt.scatter(x[y == 1, 0], x[y == 1, 1])
plt.scatter(x[y == 2, 0], x[y == 2, 1])
plt.show()
```
```python
from sklearn.tree import DecisionTreeClassifier

dc = DecisionTreeClassifier(max_depth=2, criterion="entropy")
dc.fit(x, y)
dc.score(x, y)
>>> 0.96

plot_decision_boundary(dc, axis=[0.5, 7.5, 0, 3])
plt.scatter(x[y == 0, 0], x[y == 0, 1])
plt.scatter(x[y == 1, 0], x[y == 1, 1])
plt.scatter(x[y == 2, 0], x[y == 2, 1])
plt.show()
```
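To inspect exactly which thresholds the tree learned, scikit-learn's tree module can print or draw the fitted tree. This is an optional sketch; the feature names passed in are just labels for the two petal columns selected above.

```python
from sklearn.tree import export_text, plot_tree

# Text view of the learned splits (thresholds on the two petal features)
print(export_text(dc, feature_names=["petal length", "petal width"]))

# Graphical view of the same tree
plot_tree(dc, feature_names=["petal length", "petal width"], filled=True)
plt.show()
```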
Gini impurity
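For k classes with proportions $p_i$, the Gini impurity is

$$G = 1 - \sum_{i=1}^{k} p_i^2$$

In the binary case this reduces to $G(p) = 1 - p^2 - (1 - p)^2 = 2p - 2p^2$, which is what the function below plots.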
```python
def G(p):
    return 2 * p - 2 * p ** 2


x = np.linspace(0.01, 0.99, 200)
plt.plot(x, G(x))
plt.show()
```
```python
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target

plt.scatter(x[y == 0, 0], x[y == 0, 1])
plt.scatter(x[y == 1, 0], x[y == 1, 1])
plt.scatter(x[y == 2, 0], x[y == 2, 1])
plt.show()
```
Simulating a split using information entropy
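The code for this part is not in the original notes; the following is a minimal sketch of how the simulation could look. The helper names (entropy_of, split, try_split) are my own: the search scans every feature and every midpoint between consecutive sorted values as a candidate threshold, and keeps the split with the lowest weighted entropy.

```python
from collections import Counter


def entropy_of(y):
    """Entropy of a label vector."""
    counter = Counter(y)
    result = 0.0
    for count in counter.values():
        p = count / len(y)
        result -= p * np.log(p)
    return result


def split(x, y, d, value):
    """Split the data on feature d at the given threshold."""
    index_l = (x[:, d] <= value)
    index_r = (x[:, d] > value)
    return x[index_l], x[index_r], y[index_l], y[index_r]


def try_split(x, y, impurity=entropy_of):
    """Return the split (feature, threshold) with the lowest weighted impurity."""
    best_score, best_d, best_v = float("inf"), -1, -1
    for d in range(x.shape[1]):
        sorted_index = np.argsort(x[:, d])
        for i in range(1, len(x)):
            if x[sorted_index[i - 1], d] != x[sorted_index[i], d]:
                v = (x[sorted_index[i - 1], d] + x[sorted_index[i], d]) / 2
                x_l, x_r, y_l, y_r = split(x, y, d, v)
                score = (len(y_l) * impurity(y_l) + len(y_r) * impurity(y_r)) / len(y)
                if score < best_score:
                    best_score, best_d, best_v = score, d, v
    return best_score, best_d, best_v


best_entropy, best_d, best_v = try_split(x, y)
print("best entropy:", best_entropy)
print("best feature:", best_d, "best threshold:", best_v)
```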
Simulating a split using Gini impurity
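Again this code is missing from the original notes; assuming the try_split sketch above, only the impurity function changes:

```python
def gini_of(y):
    """Gini impurity of a label vector."""
    counter = Counter(y)
    result = 1.0
    for count in counter.values():
        p = count / len(y)
        result -= p ** 2
    return result


best_gini, best_d, best_v = try_split(x, y, impurity=gini_of)
print("best gini:", best_gini)
print("best feature:", best_d, "best threshold:", best_v)
```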
Hyperparameter tuning
```python
iris = datasets.load_iris()
x = iris.data[:, 2:]
y = iris.target

from sklearn.tree import DecisionTreeClassifier

dc = DecisionTreeClassifier(max_depth=3, criterion="gini")
dc.fit(x, y)
dc.score(x, y)

plot_decision_boundary(dc, axis=[0.5, 7.5, 0, 3])
plt.scatter(x[y == 0, 0], x[y == 0, 1])
plt.scatter(x[y == 1, 0], x[y == 1, 1])
plt.scatter(x[y == 2, 0], x[y == 2, 1])
plt.show()
```
```python
dc = DecisionTreeClassifier(min_samples_leaf=5, criterion="gini")
dc.fit(x, y)
dc.score(x, y)

plot_decision_boundary(dc, axis=[0.5, 7.5, 0, 3])
plt.scatter(x[y == 0, 0], x[y == 0, 1])
plt.scatter(x[y == 1, 0], x[y == 1, 1])
plt.scatter(x[y == 2, 0], x[y == 2, 1])
plt.show()
```
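Other pruning-related hyperparameters of DecisionTreeClassifier can be combined in the same way, e.g. min_samples_split, max_leaf_nodes and max_features. The values below are purely illustrative:

```python
dc = DecisionTreeClassifier(
    max_depth=4,           # maximum depth of the tree
    min_samples_split=10,  # a node needs at least this many samples to be split
    min_samples_leaf=6,    # every leaf must contain at least this many samples
    max_leaf_nodes=8,      # upper bound on the number of leaves
    criterion="gini",
)
dc.fit(x, y)
dc.score(x, y)
```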
Decision trees for regression
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Note: load_boston was removed in scikit-learn 1.2; with a recent version,
# substitute another regression dataset such as fetch_california_housing.
boston = datasets.load_boston()
x = boston.data
y = boston.target
trainX, testX, trainY, testY = train_test_split(x, y)

dt_reg = DecisionTreeRegressor(max_depth=3, min_samples_leaf=3)
dt_reg.fit(trainX, trainY)
print("train:", dt_reg.score(trainX, trainY))
print("test:", dt_reg.score(testX, testY))
>>> train: 0.8340090210169837
>>> test: 0.7389146453386009


def MyDecisionTreeRegressor(trainX, trainY, testX, testY, deep=3, min_samples_leaf=3):
    dt_reg = DecisionTreeRegressor(max_depth=deep, min_samples_leaf=min_samples_leaf)
    dt_reg.fit(trainX, trainY)
    return dt_reg.score(trainX, trainY), dt_reg.score(testX, testY)


# Grid search over max_depth (rows) and min_samples_leaf (columns)
best_score = np.zeros((10, 20))
for i in range(1, 10):
    for j in range(1, 20):
        train, test = MyDecisionTreeRegressor(trainX, trainY, testX, testY, deep=i, min_samples_leaf=j)
        best_score[i, j] = test

print(np.max(best_score))
row, column = np.where(best_score == np.max(best_score))
print("row:", row[0])
print("column:", column[0])
>>> 0.7707301283273235
>>> row: 4
>>> column: 3
```
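The manual double loop above can also be written with scikit-learn's GridSearchCV, which cross-validates each parameter combination instead of scoring it on a single test split. A sketch, with the grid ranges mirroring the loop above:

```python
from sklearn.model_selection import GridSearchCV

param_grid = {
    "max_depth": list(range(1, 10)),
    "min_samples_leaf": list(range(1, 20)),
}
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid.fit(trainX, trainY)

print(grid.best_params_)         # best combination found by cross-validation
print(grid.score(testX, testY))  # score of the refitted best model on the test set
```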
Limitations of decision trees
- Decision boundaries are always axis-aligned (built from horizontal and vertical segments)
- Very sensitive to individual training samples