Machine Learning: Decision Tree Modeling

Decision Trees

Decision trees are well suited to data that can be separated by split points (thresholds).

Advantages
- Easy to interpret: the trained model can be explained in terms of the actual data, which makes it easy to understand
- Accepts both categorical and numerical data as input
- Tolerates missing data

Disadvantages
- Overfitting: branches created specifically for the training data can be overly specialized. Remedy: prune the decision tree
- Different split tests are needed for different kinds of data: one for numbers, another for strings
- Inefficient on large datasets
- Splits are limited to greater-than/less-than and equality tests, so complex relationships in the data cannot be captured
import math

my_data = [['slashdot', 'USA', 'yes', 18, 'None'],
           ['google', 'France', 'yes', 23, 'Premium'],
           ['digg', 'USA', 'yes', 24, 'Basic'],
           ['kiwitobes', 'France', 'yes', 23, 'Basic'],
           ['google', 'UK', 'no', 21, 'Premium'],
           ['(direct)', 'New Zealand', 'no', 12, 'None'],
           ['(direct)', 'UK', 'no', 21, 'Basic'],
           ['google', 'USA', 'no', 24, 'Premium'],
           ['slashdot', 'France', 'yes', 19, 'None'],
           ['digg', 'USA', 'no', 18, 'None'],
           ['google', 'UK', 'no', 18, 'None'],
           ['kiwitobes', 'UK', 'no', 19, 'None'],
           ['digg', 'New Zealand', 'yes', 12, 'Basic'],
           ['slashdot', 'UK', 'no', 21, 'None'],
           ['google', 'UK', 'yes', 18, 'Basic'],
           ['kiwitobes', 'France', 'yes', 19, 'Basic']]
class decisionnode:
    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        """
        Args:
            col: index of the column tested at this node
            value: the value the column must match for the test to be true
            results: dict of outcome counts; None except at leaf nodes
            tb: child node followed when the test is true (true branch)
            fb: child node followed when the test is false (false branch)
        """
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb
def divideset(rows, col, value):
    """
    Split the rows into two sets based on the data in column col
    Args:
        rows: the dataset
        col: the column to split on
        value: the dividing value (threshold) for column col
    Returns:
        A tuple containing the two sets
    """
    # Function that decides which set a row belongs to
    split_function = None
    # If the value is numeric, split into rows >= value and rows < value
    if isinstance(value, (int, float)):
        split_function = lambda row: row[col] >= value
    else:
        # If it is a string, split into rows equal to value and the rest
        split_function = lambda row: row[col] == value
    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1, set2)
divideset(my_data, 2, 'yes')
([['slashdot', 'USA', 'yes', 18, 'None'],
  ['google', 'France', 'yes', 23, 'Premium'],
  ['digg', 'USA', 'yes', 24, 'Basic'],
  ['kiwitobes', 'France', 'yes', 23, 'Basic'],
  ['slashdot', 'France', 'yes', 19, 'None'],
  ['digg', 'New Zealand', 'yes', 12, 'Basic'],
  ['google', 'UK', 'yes', 18, 'Basic'],
  ['kiwitobes', 'France', 'yes', 19, 'Basic']],
 [['google', 'UK', 'no', 21, 'Premium'],
  ['(direct)', 'New Zealand', 'no', 12, 'None'],
  ['(direct)', 'UK', 'no', 21, 'Basic'],
  ['google', 'USA', 'no', 24, 'Premium'],
  ['digg', 'USA', 'no', 18, 'None'],
  ['google', 'UK', 'no', 18, 'None'],
  ['kiwitobes', 'UK', 'no', 19, 'None'],
  ['slashdot', 'UK', 'no', 21, 'None']])
The split above, on column 2, is not a good one: the two resulting sets are not well separated. We need a suitable way to measure how mixed the outcomes within a set are. There are several such measures of impurity, including Gini impurity and entropy.
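Concretely, if $p_i$ is the fraction of rows in a set that belong to class $i$, the two measures implemented below are

$$I_G = \sum_{i \neq j} p_i p_j = 1 - \sum_i p_i^2, \qquad H = -\sum_i p_i \log_2 p_i$$

Both are zero for a pure set and grow as the set becomes more mixed.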
def uniquecounts(rows):
    """
    Count the occurrences of each value in the last column of the dataset
    Args:
        rows: the dataset
    Returns:
        A dict mapping each outcome to its count
    """
    results = {}
    for row in rows:
        r = row[-1]
        if r not in results:
            results[r] = 0
        results[r] += 1
    return results
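As a quick sanity check, counting the subscription outcomes in my_data gives seven 'None', six 'Basic', and three 'Premium' rows:

print uniquecounts(my_data)
# {'None': 7, 'Premium': 3, 'Basic': 6}  (key order may vary)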
def giniimpurity(rows):
    """
    Gini impurity:
    The probability that a randomly chosen item would be wrongly labeled
    under this grouping. The higher the probability, the worse the grouping.
    Args:
        rows: the dataset
    Returns:
        The Gini impurity of the set
    """
    total = len(rows)
    counts = uniquecounts(rows)
    impurity = 0
    for k1 in counts:
        p1 = float(counts[k1]) / total
        for k2 in counts:
            if k1 == k2:
                continue
            p2 = float(counts[k2]) / total
            impurity += p1 * p2
    return impurity
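The double loop above sums p1 * p2 over all pairs of distinct classes, which equals 1 minus the sum of squared class probabilities. As a sketch, the same value can be computed directly (giniimpurity2 is an illustrative name, not part of the original code):

def giniimpurity2(rows):
    # Closed form of the pairwise sum: 1 - sum of squared class probabilities
    total = len(rows)
    counts = uniquecounts(rows)
    return 1 - sum((float(c) / total) ** 2 for c in counts.values())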
def entropy(rows):
    """
    Entropy:
    The higher the entropy, the more mixed the set; splitting aims to reduce it
    """
    log2 = lambda x: math.log(x) / math.log(2)
    results = uniquecounts(rows)
    # Compute the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r]) / len(rows)
        ent = ent - p * log2(p)
    return ent
print 'Before splitting'
print 'Gini impurity: %f' % giniimpurity(my_data)
print 'Entropy: %f' % entropy(my_data)
print '--------------'
print 'After splitting'
(set1, set2) = divideset(my_data, 2, 'yes')
print 'Gini impurity: %f' % giniimpurity(set1)
print 'Entropy: %f' % entropy(set1)
Before splitting
Gini impurity: 0.632812
Entropy: 1.505241
--------------
After splitting
Gini impurity: 0.531250
Entropy: 1.298795
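The quantity buildtree maximizes below is the information gain of a split: the parent's impurity minus the size-weighted impurities of the two children. A minimal check for the split above, reusing set1 and set2 from the previous cell:

p = float(len(set1)) / len(my_data)
gain = entropy(my_data) - p * entropy(set1) - (1 - p) * entropy(set2)
print 'Information gain: %f' % gain
# prints roughly 0.206446 (both halves happen to have the same entropy here)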
def buildtree(rows, scorefunc=entropy):
    """
    Build the decision tree
    Args:
        rows: the dataset
        scorefunc: the function used to measure impurity
    Returns:
        The root node of the decision tree
    """
    if len(rows) == 0:
        return decisionnode()
    current_score = scorefunc(rows)
    # Track the largest information gain found so far
    best_gain = 0.0
    # The best split criterion (column, value)
    best_criteria = None
    # The best pair of subsets
    best_sets = None
    column_count = len(rows[0]) - 1
    for col in range(0, column_count):
        # The set of distinct values appearing in this column
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Try splitting on every value in the column and measure the impurity
        for value in column_values.keys():
            (set1, set2) = divideset(rows, col, value)
            # Weight of the first subset
            p = float(len(set1)) / len(rows)
            # Information gain of this split
            gain = current_score - p * scorefunc(set1) - (1 - p) * scorefunc(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    # The best split has been found; recursively split the two subsets,
    # passing the same score function down
    if best_gain > 0:
        truebranch = buildtree(best_sets[0], scorefunc)
        falsebranch = buildtree(best_sets[1], scorefunc)
        return decisionnode(col=best_criteria[0], value=best_criteria[1], tb=truebranch, fb=falsebranch)
    else:
        return decisionnode(results=uniquecounts(rows))
def printtree(tree, indent=''):
    # If this is a leaf node, print the results
    if tree.results is not None:
        print str(tree.results)
    else:
        # Print the split criterion
        print '%d:%s?' % (tree.col, tree.value)
        # Print the branches
        print indent + 'T->',
        printtree(tree.tb, indent + ' ')
        print indent + 'F->',
        printtree(tree.fb, indent + ' ')
# Get the width of a tree (its number of leaves)
def getwidth(tree):
    if tree.tb is None and tree.fb is None:
        return 1
    return getwidth(tree.tb) + getwidth(tree.fb)

# Get the height of a tree
def getdepth(tree):
    if tree.tb is None and tree.fb is None:
        return 0
    return max(getdepth(tree.tb), getdepth(tree.fb)) + 1
from PIL import Image, ImageDraw

def drawtree(tree, jpeg='tree.jpg'):
    w = getwidth(tree) * 100
    h = getdepth(tree) * 100 + 120
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    drawnode(draw, tree, w / 2, 20)
    img.save(jpeg, 'JPEG')
def drawnode(draw, tree, x, y):
    """
    Draw one node of the tree, then recursively draw its children
    Args:
        draw: the PIL ImageDraw.Draw object
        tree: the node to draw
        x, y: the pixel coordinates of this node
    """
    if tree.results is None:
        # Compute the width of each branch
        w1 = getwidth(tree.fb) * 100
        w2 = getwidth(tree.tb) * 100
        # Determine the total horizontal space occupied by this node
        left = x - (w1 + w2) / 2
        right = x + (w1 + w2) / 2
        # Draw the split criterion string
        draw.text((x - 20, y - 10), str(tree.col) + ':' + str(tree.value), (0, 0, 0))
        # Draw the connecting lines to the branches
        draw.line((x, y, x + w1 / 2, y + 100), fill=(255, 0, 0))
        draw.line((x, y, x - w2 / 2, y + 100), fill=(255, 0, 0))
        # Draw the branch nodes
        drawnode(draw, tree.fb, left + w1 / 2, y + 100)
        drawnode(draw, tree.tb, right - w2 / 2, y + 100)
    else:
        txt = ' \n'.join(['%s:%d' % v for v in tree.results.items()])
        draw.text((x - 20, y), txt, (0, 0, 0))
def classify(observation, tree):
    """
    Classify a new observation
    Args:
        observation: the row of data to classify
        tree: a trained decision tree
    Returns:
        The predicted classification
    """
    if tree.results is not None:
        return tree.results
    else:
        v = observation[tree.col]
        branch = None
        if isinstance(v, (int, float)):
            # Same comparison as divideset: >= follows the true branch
            if v >= tree.value:
                branch = tree.tb
            else:
                branch = tree.fb
        else:
            if v == tree.value:
                branch = tree.tb
            else:
                branch = tree.fb
        return classify(observation, branch)
def prune(tree, mingain):
    """
    Prune the decision tree, using mingain as the threshold.
    The recursion happens first and the merge test afterwards, so leaves are
    handled first: the deeper the recursion goes, the further down the tree
    we are, and the base case is reached at the leaf nodes.
    Args:
        tree: the trained decision tree
        mingain: threshold deciding whether a pair of leaves is merged
    """
    # If the branches are not leaves, prune them recursively
    if tree.tb.results is None:
        prune(tree.tb, mingain)
    if tree.fb.results is None:
        prune(tree.fb, mingain)
    # If both branches are now leaves, see if they should be merged
    if tree.tb.results is not None and tree.fb.results is not None:
        # Rebuild the two datasets from the leaf counts
        tb, fb = [], []
        for v, c in tree.tb.results.items():
            tb += [[v]] * c
        for v, c in tree.fb.results.items():
            fb += [[v]] * c
        # Entropy reduction: entropy of the merged set minus the average
        # entropy of the two branches
        delta = entropy(tb + fb) - (entropy(tb) + entropy(fb)) / 2
        if delta < mingain:
            # The split gains less than the threshold: merge the branches
            tree.tb = None
            tree.fb = None
            tree.results = uniquecounts(tb + fb)
# Handling missing data
def mdclassify(observation, tree):
    """
    Classify an observation that may contain missing data
    Args:
        observation: the row of data to classify
        tree: a trained decision tree
    Returns:
        The predicted classification
    """
    if tree.results is not None:
        return tree.results
    else:
        v = observation[tree.col]
        print v  # debug output: the value tested at each node
        if v == None:
            # If the value for this column is missing, follow both branches
            # and combine the results, weighted by branch size
            tr, fr = mdclassify(observation, tree.tb), mdclassify(observation, tree.fb)
            print tr, fr  # debug output: the two branch results
            tcount = sum(tr.values())
            fcount = sum(fr.values())
            tweight = float(tcount) / (tcount + fcount)
            fweight = float(fcount) / (tcount + fcount)
            result = {}
            for key, value in tr.items():
                result[key] = value * tweight
            for key, value in fr.items():
                if key not in result:
                    result[key] = 0
                result[key] += value * fweight
            return result
        else:
            branch = None
            if isinstance(v, (int, float)):
                # Same comparison as divideset: >= follows the true branch
                if v >= tree.value:
                    branch = tree.tb
                else:
                    branch = tree.fb
            else:
                if v == tree.value:
                    branch = tree.tb
                else:
                    branch = tree.fb
            return mdclassify(observation, branch)
# Handling numeric outcomes: string outcomes form a discrete set of values,
# whereas numeric outcomes are continuous
def variance(rows):
    """
    Compute the variance of the numeric outcomes in the dataset
    Returns:
        The variance
    """
    if len(rows) == 0:
        return 0
    data = [float(row[-1]) for row in rows]
    mean = sum(data) / len(data)
    variance = sum([(d - mean) ** 2 for d in data]) / len(data)
    return variance
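With variance as the score function, buildtree can model a numeric outcome: splits are chosen to reduce the spread of the values in each subset, giving a regression tree. A minimal sketch, using made-up numeric data for illustration (price_data is hypothetical, not part of the original example):

# Hypothetical data: same layout as my_data, but the last column is numeric
price_data = [['google', 'USA', 'yes', 23, 150.0],
              ['slashdot', 'UK', 'no', 18, 20.0],
              ['digg', 'France', 'yes', 30, 95.0],
              ['google', 'UK', 'no', 21, 130.0],
              ['(direct)', 'USA', 'no', 19, 25.0]]
price_tree = buildtree(price_data, scorefunc=variance)
printtree(price_tree)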
tree = buildtree(my_data)
printtree(tree)
0:google?
T-> 3:21?
 T-> {'Premium': 3}
 F-> 2:yes?
  T-> {'Basic': 1}
  F-> {'None': 1}
F-> 0:slashdot?
 T-> {'None': 3}
 F-> 2:yes?
  T-> {'Basic': 4}
  F-> 3:21?
   T-> {'Basic': 1}
   F-> {'None': 3}
drawtree(tree)
classify(['(direct)', 'USA', 'yes', 5], tree)
{'Basic': 4}
# After pruning
tree = buildtree(my_data)
prune(tree, 0.1)  # with this small threshold, no leaves are merged
printtree(tree)
0:google?
T-> 3:21?
 T-> {'Premium': 3}
 F-> 2:yes?
  T-> {'Basic': 1}
  F-> {'None': 1}
F-> 0:slashdot?
 T-> {'None': 3}
 F-> 2:yes?
  T-> {'Basic': 4}
  F-> 3:21?
   T-> {'Basic': 1}
   F-> {'None': 3}
drawtree(tree, jpeg='prune_tree_01.jpeg')
prune(tree, 1.0)  # a larger threshold collapses the root's false branch into one leaf
printtree(tree)
0:google?
T-> 3:21?
 T-> {'Premium': 3}
 F-> 2:yes?
  T-> {'Basic': 1}
  F-> {'None': 1}
F-> {'None': 6, 'Basic': 5}
drawtree(tree, jpeg='prune_tree_10.jpeg')
# Classifying with the missing data handled
mdclassify(['google', 'France', None, None], tree)
google
None
None
{'Basic': 1} {'None': 1}
{'Premium': 3} {'None': 0.5, 'Basic': 0.5}
{'Basic': 0.125, 'None': 0.125, 'Premium': 2.25}