Machine Learning Theory: Decision Trees
References:
Basic theory
统计学习方法 (Statistical Learning Methods), 李航 (Li Hang)
http://blog.csdn.net/ifruoxi/article/details/53081738
http://blog.csdn.net/lsldd/article/details/41223147
Decision trees (ID3, C4.5, CART, random forests)
https://www.2cto.com/kf/201605/509184.html
Machine learning algorithms in practice: tree regression
Well-written posts:
http://blog.csdn.net/u014688145/article/details/53326910
http://blog.csdn.net/u014688145/article/details/53212112
Pruning algorithms
http://blog.csdn.net/yujianmin1990/article/details/49864813
CART algorithm
A detailed explanation of CART (classification and regression trees), with clearly laid-out formulas
http://blog.csdn.net/zhihua_oba/article/details/72230427
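For quick reference, the split criteria used by the three algorithms below (standard definitions, following 统计学习方法; C_k are the classes and D_i the subsets induced by the n values of feature A):

H(D) = -\sum_{k=1}^{K} \frac{|C_k|}{|D|} \log_2 \frac{|C_k|}{|D|}   (entropy)
g(D, A) = H(D) - H(D \mid A), \quad H(D \mid A) = \sum_{i=1}^{n} \frac{|D_i|}{|D|} H(D_i)   (information gain; ID3)
g_R(D, A) = \frac{g(D, A)}{H_A(D)}, \quad H_A(D) = -\sum_{i=1}^{n} \frac{|D_i|}{|D|} \log_2 \frac{|D_i|}{|D|}   (gain ratio; C4.5)
\mathrm{Gini}(D) = 1 - \sum_{k=1}^{K} \left(\frac{|C_k|}{|D|}\right)^2   (Gini index; CART classification)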
Code implementation (ID3 algorithm)
http://blog.csdn.net/ifruoxi/article/details/53116427
https://www.cnblogs.com/MrLJC/p/4099404.html
# coding=utf-8
# Adapted from: https://www.cnblogs.com/MrLJC/p/4099404.html
from math import log


def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    # sic: 'no surfacing' is misspelled in the source; kept so the output below matches
    labels = ['no surfaceing', 'flippers']
    return dataSet, labels


# Compute the Shannon entropy of the class labels.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for feaVec in dataSet:
        currentLabel = feaVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


# Return the rows whose feature `axis` equals `value`, with that feature removed.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# The tree is built by consuming one feature per level, so the features may run
# out before every leaf is pure; in that case the leaf's class is decided by
# majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Return the class with the highest count (the original `max(classCount)`
    # returned the lexicographically largest key, which is a bug).
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # all labels identical: stop splitting
        return classList[0]
    if len(dataSet[0]) == 1:  # all features consumed: majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    classLabel = None  # stays None if the test value matches no branch
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def main():
    myDat, labels = createDataSet()
    myTree = createTree(myDat, labels)
    print(myTree)
    myDat, labels = createDataSet()  # createTree consumed `labels`, so rebuild it
    print(classify(myTree, labels, [1, 0]))
    print(classify(myTree, labels, [1, 1]))
    print(classify(myTree, labels, [0, 1]))


if __name__ == '__main__':
    main()
Output:
{'no surfaceing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
no
yes
no
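A quick hand check of this run, using the entropy definitions above. With two 'yes' and three 'no' labels,

H(D) = -\frac{2}{5}\log_2\frac{2}{5} - \frac{3}{5}\log_2\frac{3}{5} \approx 0.971

Splitting on 'no surfaceing' leaves the subsets {yes, yes, no} (entropy \approx 0.918) and {no, no} (entropy 0), so its gain is 0.971 - \frac{3}{5}(0.918) \approx 0.420; splitting on 'flippers' leaves {yes, yes, no, no} (entropy 1) and {no}, a gain of only 0.971 - \frac{4}{5}(1.0) \approx 0.171. chooseBestFeatureToSplit therefore picks 'no surfaceing' as the root, matching the printed tree.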
Code implementation (ID3 & C4.5 algorithms, http://blog.csdn.net/u014688145/article/details/53212112)
main.py
# coding=utf-8
# Adapted from: http://blog.csdn.net/u014688145/article/details/53212112
from math import log


def createDataSet_NOID():
    """Create the loan-application dataset (without the ID-like salary column)."""
    dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'青年', u'否', u'否', u'好', u'拒绝'],
               [u'青年', u'是', u'否', u'好', u'同意'],
               [u'青年', u'是', u'是', u'一般', u'同意'],
               [u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'中年', u'否', u'否', u'一般', u'拒绝'],
               [u'中年', u'否', u'否', u'好', u'拒绝'],
               [u'中年', u'是', u'是', u'好', u'同意'],
               [u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'老年', u'否', u'是', u'非常好', u'同意'],
               [u'老年', u'否', u'是', u'好', u'同意'],
               [u'老年', u'是', u'否', u'好', u'同意'],
               [u'老年', u'是', u'否', u'非常好', u'同意'],
               [u'老年', u'否', u'否', u'一般', u'拒绝']]
    labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']
    # Return the dataset and the name of each feature dimension
    return dataSet, labels


def createDataSet():
    """Create the same dataset with an extra, nearly unique salary column."""
    dataSet = [[u'1000', u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'2000', u'青年', u'否', u'否', u'好', u'拒绝'],
               [u'7000', u'青年', u'是', u'否', u'好', u'同意'],
               [u'7100', u'青年', u'是', u'是', u'一般', u'同意'],
               [u'3000', u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'3500', u'中年', u'否', u'否', u'一般', u'拒绝'],
               [u'3600', u'中年', u'否', u'否', u'好', u'拒绝'],
               [u'8000', u'中年', u'是', u'是', u'好', u'同意'],
               [u'9000', u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'9200', u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'8600', u'老年', u'否', u'是', u'非常好', u'同意'],
               [u'7800', u'老年', u'否', u'是', u'好', u'同意'],
               [u'10000', u'老年', u'是', u'否', u'好', u'同意'],
               [u'6500', u'老年', u'是', u'否', u'非常好', u'同意'],
               [u'3000', u'老年', u'否', u'否', u'一般', u'拒绝']]
    labels = [u'工资', u'年龄', u'有工作', u'有房子', u'信贷情况']
    # Return the dataset and the name of each feature dimension
    return dataSet, labels


def calcInformationGainRate(dataSet, baseEntropy, i):
    """
    Compute the information gain ratio g_R(D, A_i) = g(D, A_i) / H_{A_i}(D).
    :param dataSet: the dataset
    :param baseEntropy: entropy of Y over the whole dataset
    :param i: feature dimension i
    :return: the information gain ratio of feature i
    """
    # Split information H_{A_i}(D): entropy of feature i's own value distribution
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[i]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    if shannonEnt == 0.0:  # guard (added): a single-valued feature has zero split
        return 0.0         # information; the original would divide by zero here
    return calcInformationGain(dataSet, baseEntropy, i) / shannonEnt


def chooseBestFeatureToSplitByC45(dataSet):
    """Choose the split feature with the largest information gain ratio (C4.5)."""
    numFeatures = len(dataSet[0]) - 1  # the last column is the class
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGainRate = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        infoGainRate = calcInformationGainRate(dataSet, baseEntropy, i)
        if infoGainRate > bestInfoGainRate:
            bestInfoGainRate = infoGainRate
            bestFeature = i
    return bestFeature


def splitDataSet(dataSet, axis, value):
    """
    Split the dataset on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of that feature
    :return: all instances matching the value, with that feature removed
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature dimension
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class variable Y (always over the labels)."""
    numEntries = len(dataSet)  # number of instances
    labelCounts = {}
    for featVec in dataSet:  # count the frequency of each label
        currentLabel = featVec[-1]  # the last column
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt


def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """
    Compute the conditional entropy of Y given feature X_i.
    :param dataSet: the dataset
    :param i: feature dimension i
    :param featList: the i-th feature column
    :param uniqueVals: set of values of feature i
    :return: conditional entropy H(Y | X_i)
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / float(len(dataSet))  # maximum-likelihood estimate
        ce += prob * calcShannonEnt(subDataSet)       # sum of p * H(Y | X = x_i)
    return ce


def calcInformationGain(dataSet, baseEntropy, i):
    """
    Compute the information gain g(D, X_i) = H(D) - H(D | X_i).
    :param dataSet: the dataset
    :param baseEntropy: entropy of Y over the whole dataset
    :param i: feature dimension i
    :return: the information gain of feature i
    """
    featList = [example[i] for example in dataSet]  # the i-th feature column
    uniqueVals = set(featList)                      # deduplicated values
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)
    infoGain = baseEntropy - newEntropy
    return infoGain


def chooseBestFeatureToSplitByID3(dataSet):
    """Choose the split feature with the largest information gain (ID3)."""
    numFeatures = len(dataSet[0]) - 1  # the last column is the class
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all feature dimensions
        infoGain = calcInformationGain(dataSet, baseEntropy, i)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # dimension of the best feature


# The tree is built by consuming one feature per level, so the features may run
# out before every leaf is pure; the leaf's class is then decided by majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Return the class with the highest count (the original `max(classCount)`
    # returned the lexicographically largest key, which is a bug).
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """
    Build a decision tree.
    :param dataSet: the dataset
    :param labels: the name of each feature dimension
    :return: the decision tree as nested dicts
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all labels identical: stop splitting
    if len(dataSet[0]) == 1:  # only the label column remains: majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature name
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),
                                                  subLabels, chooseBestFeatureToSplitFunc)
    return myTree


# Build the tree with C4.5 and plot it
dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels, chooseBestFeatureToSplitByC45)

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False    # render '-' correctly when saving figures

import treePlotter
treePlotter.createPlot(myTree)
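The nearly unique 工资 column in createDataSet() exists to show C4.5's point: every salary group is pure, so ID3's information gain for 工资 is maximal, even though the feature generalizes terribly. C4.5 divides the gain by the feature's split information H_A(D), which is large for many-valued features, and so demotes it. A minimal check, pasted at the end of main.py (fresh copies, since the module-level demo above consumes dataSet and labels):

# Compare ID3's criterion (gain) with C4.5's (gain ratio) on each feature.
dataSet, labels = createDataSet()
baseEntropy = calcShannonEnt(dataSet)
for i, name in enumerate(labels):
    gain = calcInformationGain(dataSet, baseEntropy, i)
    ratio = calcInformationGainRate(dataSet, baseEntropy, i)
    print(u'%s: gain=%.3f, gain ratio=%.3f' % (name, gain, ratio))
# Expect 工资 to win on gain but lose on gain ratio.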
treePlotter.py
# coding=utf-8
import matplotlib.pyplot as plt

# Text-box and arrow styles
decisionNode = dict(boxstyle="round4", color='#3366FF')  # internal (decision) node
leafNode = dict(boxstyle="circle", color='#FF6633')      # leaf node
arrow_args = dict(arrowstyle="<-", color='g')            # arrow style


# Draw an annotated node with an arrow from its parent.
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


# Count the leaves of the tree.
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


# Compute the depth of the tree.
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


# Label the edge between a parent and a child node.
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)              # label the incoming edge
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
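treePlotter lays the tree out on the unit square: totalW is the leaf count and totalD the depth, each leaf advances xOff by 1/totalW, each internal node is centered above its own leaves, and yOff is decremented before recursing into children and restored afterwards. A minimal standalone check, using the tree produced by the ID3 example earlier:

# coding=utf-8
import treePlotter

tree = {'no surfaceing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
treePlotter.createPlot(tree)  # opens a window with the rendered tree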
Code implementation (CART algorithm, classification tree)
# coding=utf-8
# Adapted from: http://blog.csdn.net/u014688145/article/details/53212112
import operator  # needed by majorityCnt (the import was missing in the original listing)


def createDataSet():
    """Create the loan-application dataset."""
    dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'青年', u'否', u'否', u'好', u'拒绝'],
               [u'青年', u'是', u'否', u'好', u'同意'],
               [u'青年', u'是', u'是', u'一般', u'同意'],
               [u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'中年', u'否', u'否', u'一般', u'拒绝'],
               [u'中年', u'否', u'否', u'好', u'拒绝'],
               [u'中年', u'是', u'是', u'好', u'同意'],
               [u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'老年', u'否', u'是', u'非常好', u'同意'],
               [u'老年', u'否', u'是', u'好', u'同意'],
               [u'老年', u'是', u'否', u'好', u'同意'],
               [u'老年', u'是', u'否', u'非常好', u'同意'],
               [u'老年', u'否', u'否', u'一般', u'拒绝']]
    labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']
    # Return the dataset and the name of each feature dimension
    return dataSet, labels


def createDataSet_ID():
    """Create the same dataset with an extra, nearly unique salary column."""
    dataSet = [[u'1000', u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'2000', u'青年', u'否', u'否', u'好', u'拒绝'],
               [u'7000', u'青年', u'是', u'否', u'好', u'同意'],
               [u'7100', u'青年', u'是', u'是', u'一般', u'同意'],
               [u'3000', u'青年', u'否', u'否', u'一般', u'拒绝'],
               [u'3500', u'中年', u'否', u'否', u'一般', u'拒绝'],
               [u'3600', u'中年', u'否', u'否', u'好', u'拒绝'],
               [u'8000', u'中年', u'是', u'是', u'好', u'同意'],
               [u'9000', u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'9200', u'中年', u'否', u'是', u'非常好', u'同意'],
               [u'8600', u'老年', u'否', u'是', u'非常好', u'同意'],
               [u'7800', u'老年', u'否', u'是', u'好', u'同意'],
               [u'10000', u'老年', u'是', u'否', u'好', u'同意'],
               [u'6500', u'老年', u'是', u'否', u'非常好', u'同意'],
               [u'3000', u'老年', u'否', u'否', u'一般', u'拒绝']]
    labels = [u'工资', u'年龄', u'有工作', u'有房子', u'信贷情况']
    # Return the dataset and the name of each feature dimension
    return dataSet, labels


def calcGini(dataSet):
    """Gini index of the class labels: Gini(D) = 1 - sum_k p_k^2."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count each class label
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        gini -= prob * prob
    return gini


def splitOtherDataSetByValue(dataSet, axis, value):
    """
    Split the dataset on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of that feature
    :return: all instances NOT matching the value, with that feature removed
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] != value:
            reduceFeatVec = featVec[:axis]  # drop this feature dimension
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet


def splitDataSet(dataSet, axis, value):
    """
    Split the dataset on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of that feature
    :return: all instances matching the value, with that feature removed
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature dimension
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet


def chooseBestFeatureToSplitByCART(dataSet):
    """Choose the (feature, value) binary split with the smallest weighted Gini index."""
    numFeatures = len(dataSet[0]) - 1
    bestGiniIndex = float('inf')
    bestSplitValue = None
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # discrete features only
        uniqueVals = set(featList)
        bestGiniCut = float('inf')
        bestGiniCutValue = None
        # For each candidate value, evaluate the binary split "== value" vs "!= value"
        # and remember the best cut point for this feature.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            giniValue = prob * calcGini(subDataSet)
            otherDataSet = splitOtherDataSetByValue(dataSet, i, value)
            prob = len(otherDataSet) / float(len(dataSet))
            giniValue += prob * calcGini(otherDataSet)
            if giniValue < bestGiniCut:  # best cut point for this feature
                bestGiniCut = giniValue
                bestGiniCutValue = value
        if bestGiniCut < bestGiniIndex:  # best feature overall
            bestGiniIndex = bestGiniCut
            bestSplitValue = bestGiniCutValue
            bestFeature = i
    print(bestFeature, bestSplitValue)  # debug: chosen feature and cut value
    # Binarize the chosen feature around the recorded cut value so that the
    # subsequent split is two-way, as CART requires.
    binaryZationDataSet(bestFeature, bestSplitValue, dataSet)
    return bestFeature


# The tree is built by consuming one feature per level, so the features may run
# out before every leaf is pure; the leaf's class is then decided by majority vote.
def majorityCnt(classList):
    """
    Return the most frequent class name.
    :param classList: list of class labels
    :return: the most frequent class
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def binaryZationDataSet(bestFeature, bestSplitValue, dataSet):
    # Count the distinct values of the chosen feature
    featList = [example[bestFeature] for example in dataSet]
    uniqueValues = set(featList)
    # If the feature takes two or more values, relabel everything except the cut
    # value as u'其他' ("other"); two-valued features are processed too, so the
    # effect on tree construction is visible.
    if len(uniqueValues) >= 2:
        for i in range(len(dataSet)):
            if dataSet[i][bestFeature] != bestSplitValue:
                dataSet[i][bestFeature] = u'其他'


def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByCART):
    """
    Build a decision tree.
    :param dataSet: the dataset
    :param labels: the name of each feature dimension
    :return: the decision tree as nested dicts
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all labels identical: stop splitting
    if len(dataSet[0]) == 1:  # only the label column remains: majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # nested dict keyed by the feature name
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),
                                                  subLabels, chooseBestFeatureToSplitFunc)
    return myTree


# Build and plot the CART classification tree
dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels, chooseBestFeatureToSplitByCART)

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']  # a font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False    # render '-' correctly when saving figures

import treePlotter
treePlotter.createPlot(myTree)
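The listing above covers only the classification half of CART; the tree-regression post linked at the top handles the other half. For completeness, a minimal sketch of a CART regression tree: it binary-splits on the (feature, value) pair that minimizes total squared error and stores the mean target value at each leaf. All names here (regLeaf, regErr, chooseBestSplit, the thresholds) are illustrative, not taken from the posts above.

# coding=utf-8
# Minimal CART regression-tree sketch. Assumptions: numeric features,
# numeric target in the last column of each row.
def regLeaf(dataSet):
    """Leaf value: mean of the targets."""
    targets = [row[-1] for row in dataSet]
    return sum(targets) / float(len(targets))


def regErr(dataSet):
    """Total squared error of the targets around their mean."""
    mean = regLeaf(dataSet)
    return sum((row[-1] - mean) ** 2 for row in dataSet)


def binSplit(dataSet, feat, value):
    """Binary split: rows with feature <= value go left, the rest go right."""
    left = [row for row in dataSet if row[feat] <= value]
    right = [row for row in dataSet if row[feat] > value]
    return left, right


def chooseBestSplit(dataSet, minErrGain=1e-4, minLeafSize=2):
    """Scan all (feature, value) pairs for the split with the smallest total error."""
    bestErr, bestFeat, bestVal = float('inf'), None, None
    for feat in range(len(dataSet[0]) - 1):
        for value in set(row[feat] for row in dataSet):
            left, right = binSplit(dataSet, feat, value)
            if len(left) < minLeafSize or len(right) < minLeafSize:
                continue  # pre-pruning: both children must be large enough
            err = regErr(left) + regErr(right)
            if err < bestErr:
                bestErr, bestFeat, bestVal = err, feat, value
    if bestFeat is None or regErr(dataSet) - bestErr < minErrGain:
        return None, None  # not worth splitting: make a leaf
    return bestFeat, bestVal


def createRegTree(dataSet):
    feat, val = chooseBestSplit(dataSet)
    if feat is None:
        return regLeaf(dataSet)
    left, right = binSplit(dataSet, feat, val)
    return {'feat': feat, 'val': val,
            'left': createRegTree(left),
            'right': createRegTree(right)}


if __name__ == '__main__':
    # Toy 1-D example: a step function around x = 0.5; the tree should split
    # once and put the two group means at the leaves.
    data = [[0.1, 1.0], [0.2, 1.1], [0.3, 0.9], [0.6, 3.0], [0.7, 3.1], [0.8, 2.9]]
    print(createRegTree(data))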