Study Notes on 郑捷's 《机器学习算法原理与编程实践》 (Machine Learning Algorithms: Principles and Programming Practice), Chapter 3, The Evolution of Decision Trees (Part 2): C4.5
(Continued from Chapter 3)
3.3.1 The Information Gain Ratio
The information gain ratio is defined as:
GainRatio(S,A) = Gain(S,A)/SplitInfo(S,A)
Here Gain(S,A) is the information gain from the ID3 algorithm, and the split information SplitInfo(S,A) measures the breadth and uniformity of the partition of the sample set S induced by feature A:

SplitInfo(S,A) = -Σ_{i=1}^{c} (|S_i|/|S|) * log2(|S_i|/|S|)

where S_1 through S_c are the sample subsets formed by the c distinct values of feature A.
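To make the definition concrete, here is a minimal sketch (an eight-sample toy data set made up for illustration, not taken from the book) that computes Gain, SplitInfo, and GainRatio for one candidate feature:

# coding: utf-8
# Toy gain-ratio computation for a single feature A over a two-class sample set
import math
from collections import Counter

def entropy(labels):
    # Shannon entropy of a label list: -sum(p * log2(p))
    n = float(len(labels))
    return -sum((c / n) * math.log(c / n, 2) for c in Counter(labels).values())

# Eight samples of the form (value of feature A, class label)
samples = [("a", "yes"), ("a", "yes"), ("a", "no"), ("b", "yes"),
           ("b", "no"), ("b", "no"), ("c", "no"), ("c", "no")]
labels = [y for _, y in samples]

base = entropy(labels)                     # H(S)
cond = 0.0
split_info = 0.0
for v in set(f for f, _ in samples):       # one subset S_i per value of A
    subset = [y for f, y in samples if f == v]
    p = len(subset) / float(len(samples))  # |S_i| / |S|
    cond += p * entropy(subset)            # conditional-entropy term
    split_info -= p * math.log(p, 2)       # SplitInfo term

gain = base - cond                         # Gain(S, A), as in ID3
print("Gain=%.4f SplitInfo=%.4f GainRatio=%.4f"
      % (gain, split_info, gain / split_info))

Note how SplitInfo penalizes feature A for spreading the samples over three values: the resulting gain ratio is noticeably smaller than the raw information gain.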
3.3.2 Implementing C4.5
# coding: utf-8
from numpy import *
import math
import copy
import pickle  # kept from the book's listing; used there to persist trees, not in this snippet

# An ID3DTree class that encapsulates the algorithm; it reuses the ID3
# skeleton, with feature selection replaced by C4.5's gain-ratio criterion:
class ID3DTree(object):
    def __init__(self):    # constructor
        self.tree = {}     # the generated tree
        self.dataSet = []  # the data set
        self.labels = []   # the label set

    # Data-loading function
    def loadDataSet(self, path, labels):
        with open(path, "r") as fp:
            content = fp.read()
        rowlist = content.splitlines()  # one record per line
        self.dataSet = [row.split(" ") for row in rowlist if row.strip()]
        self.labels = labels

    # Train: build the decision tree
    def train(self):
        labels = copy.deepcopy(self.labels)
        self.tree = self.buildTree(self.dataSet, labels)

    # 3.2.3 Main decision-tree methods
    # (1) Build the decision tree: the main construction routine
    def buildTree(self, dataSet, labels):
        cateList = [data[-1] for data in dataSet]  # extract the decision-label column
        # Stopping condition 1: if cateList holds only one decision label,
        # stop splitting and return that label
        if cateList.count(cateList[0]) == len(cateList):
            return cateList[0]
        # Stopping condition 2: if only the label column is left,
        # return the majority label
        if len(dataSet[0]) == 1:
            return self.maxCate(cateList)
        # Core of the algorithm:
        bestFeat, featValueList = self.getBestFeat(dataSet)  # best feature axis of the data set
        bestFeatLabel = labels[bestFeat]
        tree = {bestFeatLabel: {}}
        del(labels[bestFeat])
        for value in featValueList:  # grow the tree recursively
            subLabels = labels[:]  # label set with the chosen feature removed
            # split the data set on the best feature column and its value
            splitDataset = self.splitDataSet(dataSet, bestFeat, value)
            subTree = self.buildTree(splitDataset, subLabels)
            tree[bestFeatLabel][value] = subTree
        return tree

    # Return the most frequent class label
    def maxCate(self, catelist):
        items = dict([(catelist.count(i), i) for i in catelist])
        return items[max(items.keys())]

    # Find the best feature
    def getBestFeat(self, dataSet):
        # Feature count = record length - 1; the last column is the class label
        Num_Feats = len(dataSet[0][:-1])
        totality = len(dataSet)
        BaseEntropy = self.computeEntropy(dataSet)  # base entropy: Shannon entropy of the source data
        ConditionEntropy = []  # initialize the conditional entropies
        splitInfo = []         # for C4.5: needed to calculate the gain ratio
        allFeatVList = []
        for f in range(Num_Feats):
            featList = [example[f] for example in dataSet]
            [splitI, featureValueList] = self.computeSplitInfo(featList)
            allFeatVList.append(featureValueList)
            splitInfo.append(splitI)
            resultGain = 0.0
            for value in featureValueList:
                subSet = self.splitDataSet(dataSet, f, value)
                appearNum = float(len(subSet))
                subEntropy = self.computeEntropy(subSet)
                resultGain += (appearNum / totality) * subEntropy
            ConditionEntropy.append(resultGain)  # total conditional entropy
        infoGainArray = BaseEntropy * ones(Num_Feats) - array(ConditionEntropy)
        # C4.5 gain-ratio computation; note that a feature taking a single value
        # has zero split information, which this listing does not guard against
        infoGainRatio = infoGainArray / array(splitInfo)
        bestFeatureIndex = argsort(-infoGainRatio)[0]  # C4.5 selects by largest gain ratio
        return bestFeatureIndex, allFeatVList[bestFeatureIndex]

    # Compute the information (Shannon) entropy
    def computeEntropy(self, dataSet):
        datalen = float(len(dataSet))
        cateList = [data[-1] for data in dataSet]  # class labels of the data set
        # dict mapping each class label to its occurrence count
        items = dict([(i, cateList.count(i)) for i in cateList])
        infoEntropy = 0.0
        for key in items:
            prob = float(items[key]) / datalen
            infoEntropy -= prob * math.log(prob, 2)  # Shannon entropy: -p * log2(p)
        return infoEntropy

    # (5) Split the data set: keep the records matching value on the given
    # feature axis, and delete that feature column from them
    def splitDataSet(self, dataSet, axis, value):
        rtnList = []
        for featVec in dataSet:
            if featVec[axis] == value:
                rFeatVec = featVec[:axis]            # list slice: elements 0 .. axis-1
                rFeatVec.extend(featVec[axis + 1:])  # append the elements after the feature axis
                rtnList.append(rFeatVec)
        return rtnList  # the chosen column has been removed

    # Compute the split information
    def computeSplitInfo(self, featureVList):
        numEntries = len(featureVList)
        featureValueSetList = list(set(featureVList))
        valueCount = [featureVList.count(featVec) for featVec in featureValueSetList]
        # calculate the Shannon entropy of the value distribution
        pList = [float(item) / numEntries for item in valueCount]
        lList = [item * math.log(item, 2) for item in pList]
        splitInfo = -sum(lList)
        return splitInfo, featureValueSetList
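For completeness, a usage sketch follows; the file name dataset.dat and the four feature labels are assumptions chosen for illustration, not fixed by the listing. loadDataSet expects one space-separated record per line, with the class label in the last column:

# Hypothetical usage; file name and feature labels are assumed for illustration
dtree = ID3DTree()
dtree.loadDataSet("dataset.dat", ["age", "revenue", "student", "credit"])
dtree.train()      # grows the tree recursively under the gain-ratio criterion
print(dtree.tree)  # nested dict, e.g. {"age": {"0": "no", "1": {...}}}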