重写轮子之 ID3

这是半成品: 已完成 fit() 部分, 生成了包含一棵完整树的 node 对象.
后续工作是解析该 node 对象, 实现 predict().
# !/usr/bin/python
# -*- coding:utf-8 -*-


"""
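Re-implementation of the ID3 decision-tree algorithm, written as practice.
Information gain is the only split criterion provided.
Prerequisites for using this ID3 re-implementation:
    1. Training labels must be encoded as integers 0, 1, 2, ...
    2. Only continuous features are supported.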
"""

# Author: 相忠良(Zhong-Liang Xiang) <ugoood@163.com>
# Finished at July ***, 2017

import numpy as np
from sklearn import datasets, cross_validation


def load_data():
    """Load the iris dataset and split it into training and test parts.

    Returns the result of ``cross_validation.train_test_split`` applied to
    ``iris.data`` and ``iris.target`` with ``test_size=0.25`` and
    ``random_state=0``.

    NOTE(review): ``sklearn.cross_validation`` was deprecated in
    scikit-learn 0.18 and removed in 0.20; ``sklearn.model_selection``
    provides the replacement ``train_test_split``.
    """
    dataset = datasets.load_iris()
    features, targets = dataset.data, dataset.target
    return cross_validation.train_test_split(
        features, targets, test_size=0.25, random_state=0)


class DecisionNode(object):
    """One node of a binary decision tree.

    A node built with only ``value`` set acts as a leaf; a node built with
    ``feature_i``, ``threshold`` and the two branches acts as an internal
    (splitting) node.  The constructor only stores its arguments.

    Parameters
    ----------
    feature_i : index of the feature this node splits on (internal nodes).
    threshold : split threshold applied to that feature (internal nodes).
    value : stored value when the node is a leaf in the tree.
    left_branch : the 'left' subtree.
    right_branch : the 'right' subtree.
    """

    def __init__(self, feature_i=None, threshold=None, value=None, left_branch=None, right_branch=None):
        # The constructor used to print ``self.value`` on every call; that
        # debugging trace was removed so that building a tree has no stdout
        # side effect and the class also parses under Python 3.
        self.feature_i = feature_i          # Best feature's index
        self.threshold = threshold          # Best split threshold in the feature
        self.value = value                  # Value if the node is a leaf in the tree
        self.left_branch = left_branch      # 'Left' subtree
        self.right_branch = right_branch    # 'Right' subtree

    def __repr__(self):
        # Children are deliberately left out so the repr of a deep tree
        # stays short.
        return "DecisionNode(feature_i=%r, threshold=%r, value=%r)" % (
            self.feature_i, self.threshold, self.value)


class MyDecisionTreeClassifier(object):
    """Binary decision-tree classifier driven by information gain (ID3 style).

    Every feature is treated as continuous: at each node the candidate
    thresholds of a feature are evenly spaced points between the feature's
    minimum and maximum, and the (feature, threshold) pair with the largest
    information gain is used to split the samples in two.

    Labels must be non-negative integers (0, 1, 2, ...) because class counts
    are computed with ``np.bincount``.

    The fitted tree is stored in ``self.root`` as nested ``DecisionNode``
    objects.  Samples whose feature value is ``>= threshold`` go to
    ``left_branch``; all others go to ``right_branch``.
    """

    # Class-level attributes kept for backward compatibility.  Apart from
    # ``num_eles_in_class_label`` and ``leaf_sample_limit`` none of them is
    # read by the methods of this class (``max_depth``, ``max_leaf_nodes``
    # and ``min_samples_leaf`` are NOT enforced).
    trees = []
    num_eles_in_class_label = 3  # minimum length of the per-class count vector
    tree = {}
    predict_label = []
    X_train = []
    y_train = []
    max_depth = 3
    max_leaf_nodes = 30
    min_samples_leaf = 1
    count = 0
    leaf_sample_limit = 10  # nodes holding this many samples or fewer become leaves

    def __init__(self, ):
        self.root = None  # root DecisionNode, set by fit()

    def fit(self, X, y):
        """Build the tree from samples ``X`` (2-D) and integer labels ``y``.

        The root of the tree is stored in ``self.root``.  Returns ``self``.

        Raises ValueError when ``X`` and ``y`` have different lengths.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        # Store the tree itself; it used to be wrapped as the ``feature_i``
        # argument of an extra DecisionNode, which made the root unusable.
        self.root = self.createTree(X, y)
        return self

    def predict(self, X):
        """Return an integer array with the predicted label of every row of ``X``.

        A 1-D ``X`` is treated as a single sample.

        Raises ValueError when called before ``fit``.
        """
        if self.root is None:
            raise ValueError("fit() must be called before predict()")
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return np.array([self._predict_one(row) for row in X], dtype=int)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        Raises ValueError when ``y`` is empty or when called before ``fit``.
        """
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot score an empty sample set")
        return float(np.mean(self.predict(X) == y))

    def _predict_one(self, row):
        """Walk from the root to a leaf for one sample and return the leaf value."""
        node = self.root
        # Internal nodes are created without a value, leaves always carry one.
        while node.value is None:
            if row[node.feature_i] >= node.threshold:
                node = node.left_branch
            else:
                node = node.right_branch
        return node.value

    def _class_counts(self, labels):
        """Return the per-class sample counts of ``labels`` (zeros when empty)."""
        labels = np.asarray(labels)
        if labels.size == 0:
            # Avoid handing an empty (float) array to np.bincount.
            return np.zeros(self.num_eles_in_class_label, dtype=int)
        return np.bincount(labels, minlength=self.num_eles_in_class_label)

    def __entropy(self, label_list):
        """Return the Shannon entropy (base 2) of the labels in ``label_list``.

        An empty ``label_list`` has entropy 0.0 (it used to yield NaN from a
        0/0 division, which then leaked into the gain comparison).
        """
        counts = self._class_counts(label_list)
        total = counts.sum()
        if total == 0:
            return 0.0
        # Classes with zero samples are dropped to avoid log2(0).
        probs = counts[counts > 0] / float(total)
        return float(-np.sum(probs * np.log2(probs)))

    def gain(self, pre_split_label_list, after_split_label_list_2d):
        """Return the information gain of a split.

        ``pre_split_label_list`` holds the labels before the split and
        ``after_split_label_list_2d`` is a sequence with the labels of each
        partition after the split.  The gain is the entropy before the split
        minus the size-weighted entropy of the partitions.  Returns 0.0 when
        all partitions are empty.
        """
        n = sum(len(part) for part in after_split_label_list_2d)
        if n == 0:
            return 0.0
        weighted = 0.0
        for part in after_split_label_list_2d:
            if len(part):  # an empty partition contributes nothing
                weighted += self.__entropy(part) * (float(len(part)) / n)
        return self.__entropy(pre_split_label_list) - weighted

    def isPure(self, bincount_list):
        """Return True when exactly one class count is non-zero, e.g. [0 22 0]."""
        nonzero = [c for c in bincount_list if c != 0]
        return len(nonzero) == 1

    def maxCate(self, bincount_list):
        """Return the index (class label) holding the largest count."""
        return int(np.argmax(bincount_list))

    def createTree(self, X, y):
        """Recursively build and return the (sub)tree for samples ``X`` / labels ``y``.

        Recursion stops, producing a leaf labelled with the majority class,
        when
          * the node holds ``leaf_sample_limit`` (10) samples or fewer, or
          * the node is pure (a single class present), or
          * the best split found would leave one side empty, i.e. it cannot
            separate the samples any further (this guard prevents the
            infinite recursion that identical samples with mixed labels
            used to trigger).
        Otherwise the node is split at the best (feature, threshold) pair.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        counts = self._class_counts(y)

        if np.sum(counts) <= self.leaf_sample_limit or self.isPure(counts):
            return DecisionNode(value=self.maxCate(counts))

        f, v, _ = self.seek_best_split_feature(X, y)
        # Use the same rule as the split search (< threshold vs >= threshold)
        # so the partition actually applied is the one whose gain was scored.
        goes_left = X[:, f] >= v
        goes_right = X[:, f] < v
        if not goes_left.any() or not goes_right.any():
            return DecisionNode(value=self.maxCate(counts))

        left_branch = self.createTree(X[goes_left], y[goes_left])
        right_branch = self.createTree(X[goes_right], y[goes_right])
        return DecisionNode(feature_i=f, threshold=v, left_branch=left_branch, right_branch=right_branch)

    def seek_best_split_feature(self, samples, labels, k=10):
        """Find the feature and threshold with the largest information gain.

        For every feature, ``k - 1`` candidate thresholds are taken at even
        spacing strictly above the feature's minimum and below its maximum
        (``np.linspace(min, max, k, False)[1:]``).  A threshold ``p`` splits
        the samples into ``value < p`` and ``value >= p``.

        Parameters
        ----------
        samples : 2-D array-like, one row per sample.
        labels : non-negative integer label of every sample.
        k : number of equal intervals per feature range, must be >= 2.

        Returns
        -------
        (best_feature, best_split_point, gain_on_that_point)

        Ties are resolved deterministically: within a feature the largest
        threshold among equal gains wins, across features the lowest index.

        Raises ValueError when ``k < 2`` or ``samples`` has no feature column.
        """
        samples = np.asarray(samples)
        labels = np.asarray(labels)
        if k < 2:
            raise ValueError("k must be >= 2 to produce at least one split point")
        if samples.ndim != 2 or samples.shape[1] == 0:
            raise ValueError("samples must be a 2-D array with at least one feature")

        best_feature = None
        best_split_point = None
        gain_on_that_point = None

        for col_indx in range(samples.shape[1]):
            column = samples[:, col_indx]
            candidates = np.linspace(np.min(column), np.max(column), k, False)[1:]

            col_point = None
            col_gain = None
            for p in candidates:
                label_less = labels[column < p]
                label_bigger = labels[column >= p]
                g = self.gain(labels, [label_less, label_bigger])
                # ">=" keeps the last (largest) threshold among equal gains.
                if col_gain is None or g >= col_gain:
                    col_point, col_gain = p, g

            # ">" keeps the first feature among equal gains.
            if gain_on_that_point is None or col_gain > gain_on_that_point:
                best_feature = col_indx
                best_split_point = float(col_point)
                gain_on_that_point = col_gain

        return best_feature, best_split_point, gain_on_that_point


## Test case: fit the classifier on the iris training split.

X_train, X_test, y_train, y_test = load_data()
cls = MyDecisionTreeClassifier()

# Small hand-made arrays; they are not used by the statements below.
a = np.array([[9, 2, 3, 4],
              [5, 6, 7, 8],
              [1, 10, 11, 12],
              [13, 14, 15, 16]])
b = np.array([0, 1, 2, 3])

# Build the tree, keep a handle on its root and show the root's type.
cls.fit(X_train, y_train)
tree = cls.root
print(type(cls.root))

'''
下面是编程过程中留下的经验
'''

# Note 1: np.linspace(0, 1, 5) returns 5 evenly spaced points on [0, 1]
# (i.e. 4 equal intervals), with both endpoints included.
# np.linspace(0,1,5)
# [ 0.    0.25  0.5   0.75  1.  ]

# 重要2: np.where(a[:,0]>2) 返回矩阵a中第０列值大于２的那些行的索引号
# 返回值的样子　(array([1, 2]),)

# 重要3: 返回value最大的那个key
# print(sorted(dic, key=lambda x: dic[x])[-1])

# 重要4: np.bincount()指定最小长度
# xxx = [1,1,1,1,1]
# print np.bincount(xxx,minlength=3)
# 结果: [0 5 0]
posted on 2017-08-14 12:46 英雄与侠义的化身阅读(189) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
Spurs

公告