Decision Trees
ID3 and C4.5
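Both algorithms grow the tree by greedily choosing the feature that best separates the classes. The code below computes the quantities involved; for reference, with D the data set, p_k the proportion of class k, and D^v the subset of D taking value v on feature a, the splitting criteria are:

\[
\mathrm{Ent}(D) = -\sum_{k} p_k \log_2 p_k, \qquad
\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v),
\]
\[
\mathrm{Gain\_ratio}(D, a) = \frac{\mathrm{Gain}(D, a)}{\mathrm{IV}(a)}, \qquad
\mathrm{IV}(a) = -\sum_{v} \frac{|D^v|}{|D|} \log_2 \frac{|D^v|}{|D|}.
\]

ID3 picks the feature with the largest information gain; C4.5 uses the gain ratio instead. The implementation switches between the two via its mode argument.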
from math import log
import operator

def createDataSet():
    """
    Create the test data set (the watermelon data set).
    :return: dataSet, labels
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    return dataSet, labels
def calcShannonEnt(dataSet, id):
    """Compute the Shannon entropy of column `id` of dataSet (use id=-1 for the class label)."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[id]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    # Return the samples whose feature `axis` equals `value`, with that feature removed.
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
'''
mode = '信息增益' (information gain) or '信息增益率' (gain ratio),
i.e. the ID3 and the C4.5 splitting criteria respectively.
'''
def chooseBestFeatureToSplit(dataSet, mode):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet, -1)
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet, -1)
        infoGain = baseEntropy - newEntropy
        if mode == '信息增益率':
            IV = calcShannonEnt(dataSet, i)  # intrinsic value of feature i
            if IV == 0:
                # the feature takes a single value here and cannot split the data
                continue
            infoGain = infoGain / IV
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
'''
mode = '信息增益' (information gain) or '信息增益率' (gain ratio),
selecting the ID3 or the C4.5 criterion respectively.
'''
def createTree(dataSet, labels, mode):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # all samples share one class: return it as a leaf
        return classList[0]
    if len(dataSet[0]) == 1:
        # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet, mode)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, mode)
    return myTree
if __name__ == '__main__':
    print('信息增益')
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels, '信息增益'))
    print('信息增益率')
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels, '信息增益率'))
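The tree returned by createTree is a nested dict keyed first by a feature name and then by that feature's values, with class labels at the leaves. A minimal sketch of how such a tree could be applied to a new sample (the classify helper below is not part of the original code; it assumes an unmodified copy of the labels list and a full feature vector):

def classify(tree, featLabels, sample):
    # The outer key is a feature name, the inner keys are that feature's values.
    featName = next(iter(tree))
    featIndex = featLabels.index(featName)
    subTree = tree[featName].get(sample[featIndex])
    if subTree is None:
        return None  # feature value never seen during training
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, sample)
    return subTree  # a leaf: the predicted class

# Usage sketch (createTree deletes entries from the list it is given, so pass a copy):
# dataSet, labels = createDataSet()
# tree = createTree(dataSet, labels[:], '信息增益')
# print(classify(tree, labels, ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑']))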
CART Regression Tree
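The regression tree below fits a constant (the mean of the target) in each leaf and chooses every split so as to minimize the total squared error of the two children, which is exactly what regErr returns for a node (variance times sample count):

\[
\mathrm{err}(D) = \sum_{i \in D} (y_i - \bar{y}_D)^2,
\qquad
(j^*, s^*) = \arg\min_{j,\,s}\bigl[\mathrm{err}(D_{\mathrm{left}}) + \mathrm{err}(D_{\mathrm{right}})\bigr],
\]

where D_left and D_right are the rows with feature j greater than, respectively at most, s. Splitting stops when the error reduction falls below tolS or a child would contain fewer than tolN samples (ops = (tolS, tolN)).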
"""
Created on Wed Jan 6 13:54:43 2021
@author: koneko
"""
import numpy as np
def loadDataSet(fileName):
dataMat = []
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fitLine = list(map(float, curLine))
dataMat.append(fitLine)
return dataMat
def binSplitDataSet(dataSet, feature, value):
    # Binary split on one feature: rows above `value` go left, the rest go right.
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

def regLeaf(dataSet):
    # Leaf model: the mean of the target column.
    return np.mean(dataSet[:, -1])

def regErr(dataSet):
    # Total squared error of the node: variance times sample count.
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    tolS = ops[0]  # minimum error reduction required to split
    tolN = ops[1]  # minimum number of samples in a child node
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        # all targets identical: make a leaf
        return None, leafType(dataSet)
    m, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = np.inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    if (S - bestS) < tolS:
        # splitting does not reduce the error enough: make a leaf
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

myData = loadDataSet('ex00.txt')
myMat = np.mat(myData)
tree = createTree(myMat)
print(tree)
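The fitted tree is again a nested dict, with keys spInd, spVal, left and right and plain numbers at the leaves. A minimal prediction helper (not part of the original code) could walk it like this:

def regTreePredict(tree, x):
    # Descend until a leaf (a plain number) is reached.
    while isinstance(tree, dict):
        # binSplitDataSet sends rows with feature value > spVal to the left branch
        if x[tree['spInd']] > tree['spVal']:
            tree = tree['left']
        else:
            tree = tree['right']
    return tree

# e.g. for the single-feature ex00.txt data:
# print(regTreePredict(tree, [0.5]))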
CART Classification Tree
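The classification tree replaces entropy with the Gini index and always makes binary splits: one branch takes a single feature value, the other takes all remaining values. calcGini and chooseBestFeatureToSplit below compute

\[
\mathrm{Gini}(D) = 1 - \sum_{k} p_k^2,
\qquad
\mathrm{Gini\_index}(D, a, v) = \frac{|D_1|}{|D|}\,\mathrm{Gini}(D_1) + \frac{|D_2|}{|D|}\,\mathrm{Gini}(D_2),
\]

where D_1 holds the samples with value v on feature a and D_2 the rest; the (feature, value) pair with the smallest weighted Gini becomes the split.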
from math import log
import operator

def createDataSet():
    """
    Create the test data set (the watermelon data set).
    :return: dataSet, labels
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    return dataSet, labels
def calcGini(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    Gini = 1
    for key in labelCounts:
        p = float(labelCounts[key]) / numEntries
        Gini -= p * p
    return Gini
def createDataSet1():
    dataSet = [['长', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['长', '细', '女'],
               ['短', '细', '女'],
               ['短', '粗', '女'],
               ['长', '粗', '女'],
               ['长', '粗', '女']]
    labels = ['头发', '声音']
    return dataSet, labels
def createDataSet2():
    """
    Create the example data (a small loan-approval data set).
    :return: dataSet, labels (the samples and the feature names)
    """
    dataSet = [['青年', '否', '否', '一般', '不同意'],
               ['青年', '否', '否', '好', '不同意'],
               ['青年', '是', '否', '好', '同意'],
               ['青年', '是', '是', '一般', '同意'],
               ['青年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '一般', '不同意'],
               ['中年', '否', '否', '好', '不同意'],
               ['中年', '是', '是', '好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['中年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '非常好', '同意'],
               ['老年', '否', '是', '好', '同意'],
               ['老年', '是', '否', '好', '同意'],
               ['老年', '是', '否', '非常好', '同意'],
               ['老年', '否', '否', '一般', '不同意']]
    labels = ['年龄', '有工作', '有房子', '信贷情况']
    return dataSet, labels
def binSplitDataSet(dataSet, index, value):
    # Binary split: samples whose feature `index` equals `value` go into set1,
    # all other samples go into set2; the split feature is removed from both.
    set1 = []
    set2 = []
    for featVec in dataSet:
        reducedFeatVec = featVec[:index]
        reducedFeatVec.extend(featVec[index+1:])
        if featVec[index] == value:
            set1.append(reducedFeatVec)
        else:
            set2.append(reducedFeatVec)
    return set1, set2
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    nD = len(dataSet)
    bestGini_feat = 100
    bestFeature = -1
    for feat in range(numFeatures):
        featvals = [example[feat] for example in dataSet]
        featvals = set(featvals)
        for val in featvals:
            set0, set1 = binSplitDataSet(dataSet, feat, val)
            newGini_feat = (len(set0) / float(nD)) * calcGini(set0)
            newGini_feat += (len(set1) / float(nD)) * calcGini(set1)
            if newGini_feat < bestGini_feat:
                bestGini_feat = newGini_feat
                bestFeature = feat
                bestVal = val
    return bestFeature, bestVal
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    classList = [a[-1] for a in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat, bestVal = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    mat0, mat1 = binSplitDataSet(dataSet, bestFeat, bestVal)
    left = bestVal
    right = set([a[bestFeat] for a in dataSet])
    right.remove(bestVal)
    right = tuple(right)
    print(right)  # debug: the feature values grouped into the right branch
    # Each recursive call deletes entries from the label list it receives,
    # so the left and right branches must each get their own copy.
    myTree[bestFeatLabel][left] = createTree(mat0, labels[:])
    myTree[bestFeatLabel][right] = createTree(mat1, labels[:])
    return myTree

dataSet, labels = createDataSet2()
myTree = createTree(dataSet, labels)
print(myTree)
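In the resulting dict one inner key is the single value sent to the left branch and the other is a tuple of the remaining values. A minimal sketch of how the tree could label a new sample (cartClassify is not part of the original code; it assumes an unmodified labels list and a full feature vector):

def cartClassify(tree, featLabels, sample):
    featName = next(iter(tree))
    featIndex = featLabels.index(featName)
    value = sample[featIndex]
    for branchKey, subTree in tree[featName].items():
        # the left key is a single value, the right key a tuple of values
        matched = value in branchKey if isinstance(branchKey, tuple) else value == branchKey
        if matched:
            if isinstance(subTree, dict):
                return cartClassify(subTree, featLabels, sample)
            return subTree
    return None  # value never seen during training

# Usage sketch (pass a copy of labels to createTree, which mutates its argument):
# dataSet, labels = createDataSet2()
# tree = createTree(dataSet, labels[:])
# print(cartClassify(tree, labels, ['老年', '否', '是', '好']))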