Machine Learning Theory: Decision Trees

References:

Basic theory

统计学习方法 (Statistical Learning Methods), Li Hang (李航)

http://blog.csdn.net/ifruoxi/article/details/53081738

http://blog.csdn.net/lsldd/article/details/41223147

Decision trees (ID3, C4.5, CART, random forests)

https://www.2cto.com/kf/201605/509184.html  

Machine learning algorithms in practice: tree regression

Machine Learning in Action notes (Python implementation), 09: tree regression

Particularly well-written:

http://blog.csdn.net/u014688145/article/details/53326910

http://blog.csdn.net/u014688145/article/details/53212112

Pruning algorithms

http://blog.csdn.net/yujianmin1990/article/details/49864813

CART algorithm

Detailed explanation of CART (classification and regression trees), with clearly presented formulas

http://blog.csdn.net/zhihua_oba/article/details/72230427  


Code implementation (ID3 algorithm)

http://blog.csdn.net/ifruoxi/article/details/53116427

https://www.cnblogs.com/MrLJC/p/4099404.html

# coding=utf-8
import operator
from math import log
import time

#https://www.cnblogs.com/MrLJC/p/4099404.html
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


# Compute the Shannon entropy of the class labels in the data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for feaVec in dataSet:
        currentLabel = feaVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
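
# Quick hand-worked check (added for illustration): the toy data set above
# has 2 'yes' and 3 'no' labels, so
#   H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
# which is exactly what calcShannonEnt(dataSet) returns.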


def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
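
# Example (illustrative): splitting on feature 0 with value 1 keeps the rows
# whose first column is 1 and removes that column:
#   splitDataSet(dataSet, 0, 1) -> [[1, 'yes'], [1, 'yes'], [0, 'no']]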


def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
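
# Hand-worked numbers for the toy data set (added for illustration):
#   gain(feature 0) = 0.971 - (3/5)*0.918 - (2/5)*0.0 ≈ 0.420
#   gain(feature 1) = 0.971 - (4/5)*1.000 - (1/5)*0.0 ≈ 0.171
# so chooseBestFeatureToSplit picks feature 0 ('no surfacing').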


# The tree is built by consuming one feature per split, so we can run out of
# features while a leaf still contains a mix of classes; the leaf's class is
# then decided by majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # the original returned max(classCount), i.e. the lexicographically largest
    # label; we want the label with the highest count
    return max(classCount, key=classCount.get)


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # stop splitting when all classes are identical
        return classList[0]
    if len(dataSet[0]) == 1:  # all features have been used up
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del (labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's label list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet,
                                                               bestFeat, value), subLabels)
    return myTree
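
# The returned tree is a nested dict of the form
#   {feature_label: {feature_value: subtree_or_class_label, ...}}
# e.g. {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}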


def classify(inputTree, featLabels, testVec):
    classLabel = None  # stays None if the test value never occurred in training
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


def main():
    # data, label = createDataSet()
    # t1 = time.clock()
    # myTree = createTree(data, label)
    # t2 = time.clock()
    # print(myTree)
    # print('execute for ', t2 - t1)

    myDat, labels = createDataSet()
    myTree = createTree(myDat,labels)
    print(myTree)
    myDat, labels = createDataSet()
    print(classify(myTree, labels, [1, 0]))
    print(classify(myTree, labels, [1, 1]))
    print(classify(myTree, labels, [0, 1]))

if __name__ == '__main__':
    main()

Output:

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
no
yes
no
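
A note before the next listing: ID3's information gain is biased toward features with many distinct values. An ID-like column (such as the 工资/salary column in the data set below) shatters the data into nearly pure singleton subsets and therefore gets close to the maximum possible gain. C4.5 corrects this with the information gain ratio g_R(D, A) = g(D, A) / H_A(D), where H_A(D) is the entropy of feature A itself, which penalizes such finely fragmented splits.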

Code implementation (ID3 & C4.5 algorithms, http://blog.csdn.net/u014688145/article/details/53212112)

main.py

#coding=utf-8
#http://blog.csdn.net/u014688145/article/details/53212112
from math import log

def createDataSet_NOID():
    """
    Create the loan-application data set (Table 5.1 in Li Hang's 统计学习方法).
    The 是/否/好 values were dropped by the original page and are restored
    here from the book.
    """
    dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'青年', u'否', u'否', u'好', u'拒绝'],
                [u'青年', u'是', u'否', u'好', u'同意'],
                [u'青年', u'是', u'是', u'一般', u'同意'],
                [u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'中年', u'否', u'否', u'一般', u'拒绝'],
                [u'中年', u'否', u'否', u'好', u'拒绝'],
                [u'中年', u'是', u'是', u'好', u'同意'],
                [u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'老年', u'否', u'是', u'非常好', u'同意'],
                [u'老年', u'否', u'是', u'好', u'同意'],
                [u'老年', u'是', u'否', u'好', u'同意'],
                [u'老年', u'是', u'否', u'非常好', u'同意'],
                [u'老年', u'否', u'否', u'一般', u'拒绝'],
                ]
    labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']
    # return the data set and the name of each feature dimension
    return dataSet, labels

def createDataSet():
    """
    The same loan data set with an ID-like 工资 (salary) column added, used
    to show ID3's bias toward features with many distinct values.  The
    是/否/好 values are restored from Table 5.1 of the book.
    """
    dataSet = [[u'1000',u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'2000',u'青年', u'否', u'否', u'好', u'拒绝'],
                [u'7000',u'青年', u'是', u'否', u'好', u'同意'],
                [u'7100',u'青年', u'是', u'是', u'一般', u'同意'],
                [u'3000',u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'3500',u'中年', u'否', u'否', u'一般', u'拒绝'],
                [u'3600',u'中年', u'否', u'否', u'好', u'拒绝'],
                [u'8000',u'中年', u'是', u'是', u'好', u'同意'],
                [u'9000',u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'9200',u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'8600',u'老年', u'否', u'是', u'非常好', u'同意'],
                [u'7800',u'老年', u'否', u'是', u'好', u'同意'],
                [u'10000',u'老年', u'是', u'否', u'好', u'同意'],
                [u'6500',u'老年', u'是', u'否', u'非常好', u'同意'],
                [u'3000',u'老年', u'否', u'否', u'一般', u'拒绝'],
                ]
    labels = [u'工资',u'年龄', u'有工作', u'有房子', u'信贷情况']
    # return the data set and the name of each feature dimension
    return dataSet, labels

def calcInformationGainRate(dataSet, baseEntropy, i):
    """
    Compute the information gain ratio.
    :param dataSet: data set
    :param baseEntropy: entropy of Y in the data set
    :param i: feature dimension i
    :return: gain ratio g_R(dataSet, X_i) = g(dataSet, X_i) / H_{X_i}(dataSet)
    """
    # entropy of the feature X_i itself (the "split information")
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[i]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)

    if shannonEnt == 0.0:  # single-valued feature: the ratio is undefined, treat as 0
        return 0.0
    return calcInformationGain(dataSet, baseEntropy, i) / shannonEnt


def chooseBestFeatureToSplitByC45(dataSet):
    """
    Choose the best split feature by information gain ratio (C4.5).
    :param dataSet:
    :return: index of the best feature
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGainRate = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        infoGainRate = calcInformationGainRate(dataSet, baseEntropy, i)
        if (infoGainRate > bestInfoGainRate):
            bestInfoGainRate = infoGainRate
            bestFeature = i
    return bestFeature

def splitDataSet(dataSet, axis, value):
    """
    Split the data set on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of the feature
    :return: all instances matching the value (with that feature column removed)
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature column
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet

# Always measures the uncertainty of the class labels
def calcShannonEnt(dataSet):
    """
    Compute the Shannon entropy of the random variable Y (the class) in the
    training data.
    :param dataSet:
    :return: entropy
    """
    numEntries = len(dataSet)  # number of instances
    labelCounts = {}
    for featVec in dataSet:  # count the frequency of each class label
        currentLabel = featVec[-1]  # the last column
        # first time we see this label: add it to the map
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1

    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt

def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """
    Compute the conditional entropy of Y given feature X_i.
    :param dataSet: data set
    :param i: feature dimension i
    :param featList: values of feature i (unused here, kept for the original signature)
    :param uniqueVals: set of distinct values of feature i
    :return: conditional entropy
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / float(len(dataSet))  # maximum-likelihood estimate of P(X=value)
        ce += prob * calcShannonEnt(subDataSet)  # H(Y|X) = sum_v P(X=v) * H(Y|X=v)
    return ce

def calcInformationGain(dataSet, baseEntropy, i):
    """
    Compute the information gain.
    :param dataSet: data set
    :param baseEntropy: entropy of Y in the data set
    :param i: feature dimension i
    :return: information gain of feature i, g(dataSet | X_i) = H(Y) - H(Y|X_i)
    """
    featList = [example[i] for example in dataSet]  # values of feature i
    uniqueVals = set(featList)  # distinct values only
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)
    infoGain = baseEntropy - newEntropy  # information gain
    return infoGain
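
# Reference values (from Example 5.2 of Li Hang's book, using the restored
# createDataSet_NOID data):
#   g(D, 年龄) ≈ 0.083, g(D, 有工作) ≈ 0.324,
#   g(D, 有房子) ≈ 0.420, g(D, 信贷情况) ≈ 0.363
# so ID3 splits on 有房子 first.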

def chooseBestFeatureToSplitByID3(dataSet):
    """
    Choose the best split feature by information gain (ID3).
    :param dataSet:
    :return: index of the best feature
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all feature dimensions
        infoGain = calcInformationGain(dataSet, baseEntropy, i)
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # dimension of the best feature

# The tree is built by consuming one feature per split, so we can run out of
# features while a leaf still contains a mix of classes; the leaf's class is
# then decided by majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # the original returned max(classCount), i.e. the lexicographically largest
    # label; we want the label with the highest count
    return max(classCount, key=classCount.get)

def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """
    Build the decision tree.
    :param dataSet: data set
    :param labels: name of each feature dimension
    :return: decision tree as a nested dict
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):  # all classes identical:
        return classList[0]  # stop splitting
    if len(dataSet[0]) == 1:  # only the class column is left:
        return majorityCnt(classList)  # return the majority class
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # dict keyed by the feature label
    del (labels[bestFeat])
    # recurse into the subset for each value of the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, chooseBestFeatureToSplitFunc)
    return myTree


# # Test tree construction with ID3
# dataSet,labels = createDataSet()
# myTree = createTree(dataSet,labels)
# print(myTree)

# Test tree construction with C4.5
dataSet,labels = createDataSet()
myTree = createTree(dataSet,labels,chooseBestFeatureToSplitByC45)

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box in saved figures

# draw the decision tree
import treePlotter
treePlotter.createPlot(myTree)
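
# Illustrative comparison (added; not part of the original script).  On a
# fresh copy of the data, the ID-like 工资 column wins on raw information
# gain, while the gain ratio penalizes it and C4.5 picks 有房子 instead.
# (Expected output assumes the restored data set above; runs after the
# plot window is closed.)
ds, lb = createDataSet()
print(lb[chooseBestFeatureToSplitByID3(ds)])  # expected: 工资
ds, lb = createDataSet()
print(lb[chooseBestFeatureToSplitByC45(ds)])  # expected: 有房子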
treePlotter.py
#coding=utf-8
import matplotlib.pyplot as plt

# Text box and arrow formats
decisionNode = dict(boxstyle="round4", color='#3366FF')  # internal (decision) node style
leafNode = dict(boxstyle="circle", color='#FF6633')  # leaf node style
arrow_args = dict(arrowstyle="<-", color='g')  # arrow style

# draw an annotated node with an arrow from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)


# count the leaf nodes of the tree
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


# compute the depth (number of levels) of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


# write the split value on the edge between parent and child
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # label the edge from the parent
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw this decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

 

Code implementation (CART algorithm, classification tree)

#coding=utf-8
#http://blog.csdn.net/u014688145/article/details/53212112
import operator  # needed by majorityCnt (missing from the original listing)
from math import log

def createDataSet():
    """
    Create the loan-application data set (Table 5.1 in Li Hang's 统计学习方法).
    The 是/否/好 values were dropped by the original page and are restored
    here from the book.
    """
    dataSet = [[u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'青年', u'否', u'否', u'好', u'拒绝'],
                [u'青年', u'是', u'否', u'好', u'同意'],
                [u'青年', u'是', u'是', u'一般', u'同意'],
                [u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'中年', u'否', u'否', u'一般', u'拒绝'],
                [u'中年', u'否', u'否', u'好', u'拒绝'],
                [u'中年', u'是', u'是', u'好', u'同意'],
                [u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'老年', u'否', u'是', u'非常好', u'同意'],
                [u'老年', u'否', u'是', u'好', u'同意'],
                [u'老年', u'是', u'否', u'好', u'同意'],
                [u'老年', u'是', u'否', u'非常好', u'同意'],
                [u'老年', u'否', u'否', u'一般', u'拒绝'],
                ]
    labels = [u'年龄', u'有工作', u'有房子', u'信贷情况']
    # return the data set and the name of each feature dimension
    return dataSet, labels

def createDataSet_ID():
    """
    The same loan data set with an ID-like 工资 (salary) column added.  The
    是/否/好 values are restored from Table 5.1 of the book.
    """
    dataSet = [[u'1000',u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'2000',u'青年', u'否', u'否', u'好', u'拒绝'],
                [u'7000',u'青年', u'是', u'否', u'好', u'同意'],
                [u'7100',u'青年', u'是', u'是', u'一般', u'同意'],
                [u'3000',u'青年', u'否', u'否', u'一般', u'拒绝'],
                [u'3500',u'中年', u'否', u'否', u'一般', u'拒绝'],
                [u'3600',u'中年', u'否', u'否', u'好', u'拒绝'],
                [u'8000',u'中年', u'是', u'是', u'好', u'同意'],
                [u'9000',u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'9200',u'中年', u'否', u'是', u'非常好', u'同意'],
                [u'8600',u'老年', u'否', u'是', u'非常好', u'同意'],
                [u'7800',u'老年', u'否', u'是', u'好', u'同意'],
                [u'10000',u'老年', u'是', u'否', u'好', u'同意'],
                [u'6500',u'老年', u'是', u'否', u'非常好', u'同意'],
                [u'3000',u'老年', u'否', u'否', u'一般', u'拒绝'],
                ]
    labels = [u'工资',u'年龄', u'有工作', u'有房子', u'信贷情况']
    # return the data set and the name of each feature dimension
    return dataSet, labels

# Compute the Gini index of the data set: Gini(D) = 1 - sum_k p_k^2
def calcGini(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    # count the instances of every class
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    Gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        Gini -= prob * prob
    return Gini
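
# Hand-worked example (assuming the restored data set): 9 同意 and 6 拒绝, so
#   Gini(D) = 1 - (9/15)^2 - (6/15)^2 = 0.48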


def splitOtherDataSetByValue(dataSet, axis, value):
    """
    Split the data set on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of the feature
    :return: all instances NOT matching the value (with that feature column removed)
    """
    retDataSet = []
    # collect the complement subset for this value of feature axis
    for featVec in dataSet:
        if featVec[axis] != value:
            reduceFeatVec = featVec[:axis]  # drop this feature column
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet

def splitDataSet(dataSet, axis, value):
    """
    Split the data set on a given feature.
    :param axis: dimension of the splitting feature
    :param value: value of the feature
    :return: all instances matching the value (with that feature column removed)
    """
    retDataSet = []
    # collect the subset for this value of feature axis
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # drop this feature column
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet

def chooseBestFeatureToSplitByCART(dataSet):
    numFeatures = len(dataSet[0]) - 1
    bestGiniIndex = float('inf')
    bestSplitValue = None
    bestFeature = -1
    # evaluate the Gini index of every candidate binary split
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        # discrete feature values only
        uniqueVals = set(featList)
        bestGiniCut = float('inf')
        bestGiniCutValue = None
        Gini_value = 0.0
        # Gini index of each binary split on this feature; remember the best cut point
        for value in uniqueVals:
            # Gini index of the subset where feature i == value ...
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            Gini_value = prob * calcGini(subDataSet)
            # ... plus the complement subset:
            # Gini(D, A=a) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2)
            otherDataSet = splitOtherDataSetByValue(dataSet, i, value)
            prob = len(otherDataSet) / float(len(dataSet))
            Gini_value = Gini_value + prob * calcGini(otherDataSet)
            # keep the best cut point for this feature
            if Gini_value < bestGiniCut:
                bestGiniCut = Gini_value
                bestGiniCutValue = value

        # keep the best feature overall
        GiniIndex = bestGiniCut
        if GiniIndex < bestGiniIndex:
            bestGiniIndex = GiniIndex
            bestSplitValue = bestGiniCutValue
            bestFeature = i
            print(bestFeature, bestSplitValue)  # debug output

    # If the chosen feature has more than two values, binarize it around the
    # recorded best cut point so the split is binary, as CART requires.
    binaryZationDataSet(bestFeature, bestSplitValue, dataSet)
    return bestFeature

# The tree is built by consuming one feature per split, so we can run out of
# features while a leaf still contains a mix of classes; the leaf's class is
# then decided by majority vote.
def majorityCnt(classList):
    """
    Return the most frequent class name.
    :param classList: list of class labels
    :return: the most frequent class label
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def binaryZationDataSet(bestFeature, bestSplitValue, dataSet):
    # count the distinct values of the chosen feature
    featList = [example[bestFeature] for example in dataSet]
    uniqueValues = set(featList)

    # With more than two values the feature must be binarized around the best
    # cut point.  (To make the difference visible in the resulting tree, this
    # is also done when there are exactly two values.)
    if len(uniqueValues) >= 2:
        for i in range(len(dataSet)):
            if dataSet[i][bestFeature] == bestSplitValue:  # keep the cut-point value
                pass
            else:
                dataSet[i][bestFeature] = u'其他'  # bucket everything else as "other"
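
# Example (illustrative): if the chosen feature is 信贷情况 with cut point
# u'非常好', the values 一般 and 好 are both rewritten to u'其他', leaving the
# binary split {非常好, 其他}.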

def createTree(dataSet, labels, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByCART):
    """
    Build the decision tree.
    :param dataSet: data set
    :param labels: name of each feature dimension
    :return: decision tree as a nested dict
    """
    classList = [example[-1] for example in dataSet]  # list of class labels
    if classList.count(classList[0]) == len(classList):  # all classes identical:
        return classList[0]  # stop splitting
    if len(dataSet[0]) == 1:  # only the class column is left:
        return majorityCnt(classList)  # return the majority class
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # dict keyed by the feature label
    del (labels[bestFeat])
    # recurse into the subset for each value of the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, chooseBestFeatureToSplitFunc)
    return myTree


# # Test tree construction
# dataSet,labels = createDataSet()
# myTree = createTree(dataSet,labels)
# print(myTree)

# Test tree construction with CART
dataSet,labels = createDataSet()
myTree = createTree(dataSet,labels,chooseBestFeatureToSplitByCART)

from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box in saved figures

# draw the decision tree
import treePlotter
treePlotter.createPlot(myTree)
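
Note: because chooseBestFeatureToSplitByCART binarizes the winning feature in place (best cut-point value vs. u'其他'), every internal node of the resulting tree has at most two branches, which is the binary-split property that distinguishes CART from ID3 and C4.5 on multi-valued discrete features.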

 
