[Machine Learning] Decision Tree

Entropy

Mathematical expression

\[H(p_1) = -p_1 \log_2(p_1) - (1- p_1) \log_2(1- p_1) \]
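
For example, a node whose samples are split evenly between the two classes has maximum entropy, while a pure node has entropy 0 (taking \(0 \log_2 0 = 0\)):

\[H(0.5) = -0.5 \log_2(0.5) - 0.5 \log_2(0.5) = 1 \]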

Code

# UNQ_C1
# GRADED FUNCTION: compute_entropy

def compute_entropy(y):
    """
    Computes the entropy for a node
    
    Args:
       y (ndarray): Numpy array indicating whether each example at a node is
           edible (`1`) or poisonous (`0`)
       
    Returns:
        entropy (float): Entropy at that node
        
    """
    # You need to return the following variables correctly
    entropy = 0.
    
    ### START CODE HERE ###
    if len(y) != 0:
        # len(y[y == 1]) counts the elements of y equal to 1; equivalent to np.sum(y == 1)
        p1 = len(y[y == 1]) / len(y)
        if p1 != 0 and p1 != 1:
            entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
    ### END CODE HERE ###        
    
    return entropy
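
A quick sanity check on made-up labels (this assumes numpy has been imported as np, as in the later snippets):

y_even = np.array([1, 1, 0, 0])   # half edible, half poisonous
print(compute_entropy(y_even))    # 1.0, maximum entropy
y_pure = np.array([1, 1, 1])      # pure node
print(compute_entropy(y_pure))    # 0.0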

Information gain

Mathematical expression

\[\text{Information Gain} = H(p_1^\text{node})- (w^{\text{left}}H(p_1^\text{left}) + w^{\text{right}}H(p_1^\text{right})) \]
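
As a worked example, suppose a node holds 10 samples with \(p_1^\text{node} = 0.5\), and a split sends 6 samples left with \(p_1^\text{left} = 4/6\) and 4 samples right with \(p_1^\text{right} = 1/4\):

\[\text{Information Gain} = H(0.5) - \left(0.6\, H(4/6) + 0.4\, H(1/4)\right) \approx 1 - (0.6 \times 0.918 + 0.4 \times 0.811) \approx 0.125 \]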

Code
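
compute_information_gain below calls a split_dataset helper that is not shown in this post. A minimal sketch, assuming one-hot (binary) features so that samples with feature value 1 go to the left branch and the rest go to the right:

def split_dataset(X, node_indices, feature):
    """
    Splits the samples at the given node into
    left (feature value 1) and right (feature value 0) branches.
    """
    left_indices, right_indices = [], []
    for i in node_indices:
        if X[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices, right_indices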

# UNQ_C3
# GRADED FUNCTION: compute_information_gain

def compute_information_gain(X, y, node_indices, feature):
    
    """
    Compute the information gain of splitting the node on a given feature
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
        feature (int):          Index of the feature to split on
   
    Returns:
        information_gain (float): Information gain computed
    
    """    
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)
    
    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]
    
    # You need to return the following variables correctly
    information_gain = 0
    
    ### START CODE HERE ###
    
    # Entropy at the node and at each branch
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    # Weights: fraction of the node's samples that go to each branch
    w_left = len(X_left) / len(X_node)
    w_right = len(X_right) / len(X_node)
    # Weighted entropy
    weighted_entropy = w_left * left_entropy + w_right * right_entropy
    # Information gain                                                   
    information_gain = node_entropy - weighted_entropy
    ### END CODE HERE ###  
    
    return information_gain
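
A quick check on made-up binary data, using the split_dataset sketch above. Feature 0 happens to separate the two classes perfectly, so its gain equals the node entropy:

X_toy = np.array([[1, 0], [1, 1], [0, 1], [0, 0], [1, 0]])
y_toy = np.array([1, 1, 0, 0, 1])
root_indices = list(range(len(X_toy)))
print(compute_information_gain(X_toy, y_toy, root_indices, feature=0))  # ~0.971
print(compute_information_gain(X_toy, y_toy, root_indices, feature=1))  # ~0.020
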
# UNQ_C4
# GRADED FUNCTION: get_best_split

def get_best_split(X, y, node_indices):   
    """
    Returns the optimal feature to split the node data
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.

    Returns:
        best_feature (int):     The index of the best feature to split
    """    
    
    # Some useful variables
    num_features = X.shape[1]
    
    # You need to return the following variables correctly
    best_feature = -1
    max_info_gain = 0
    ### START CODE HERE ###
    for feature in range(num_features):
        info_gain = compute_information_gain(X, y, node_indices, feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
    ### END CODE HERE ###    
   
    return best_feature
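
Continuing with the toy arrays from the sketch above:

print(get_best_split(X_toy, y_toy, root_indices))  # 0, the feature with the largest gain
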
# Not graded
tree = []

def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that splits the dataset into 2 subgroups at each node.
    This function just prints the tree.
    
    Args:
        X (ndarray):            Data matrix of shape(n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int):        Max depth of the resulting tree. 
        current_depth (int):    Current depth. Parameter used during recursive call.
   
    """ 

    # Maximum depth reached - stop splitting
    if current_depth == max_depth:
        formatting = " "*current_depth + "-"*current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return
   
    # Otherwise, get best split and split the data
    # Get the best feature and threshold at this node
    best_feature = get_best_split(X, y, node_indices) 
    tree.append((current_depth, branch_name, best_feature, node_indices))
    
    formatting = "-"*current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
    
    # Split the dataset at the best feature
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    
    # continue splitting the left and the right child. Increment current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)
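
With the same toy data, the tree printer can be exercised end to end:

build_tree_recursive(X_toy, y_toy, root_indices, "Root", max_depth=1, current_depth=0)
# prints the depth-0 split on feature 0, then one leaf line per branch
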
A full implementation on the iris dataset

import numpy as np
from sklearn.datasets import load_iris


class DecisionTree():
    def __init__(self, X_train, y_train, branch_name, max_depth) -> None:
        self.X_train = X_train
        self.y_train = y_train
        self.k = self.X_train.shape[1] # number of feature columns
        self.branch_name = branch_name
        self.max_depth = max_depth

    def split_dataset(self, data, feature, value):
        '''Split the dataset on a feature at a threshold value. Each row of
        data carries its label in the last column, so the labels stay
        aligned with their rows through every split.'''
        left_data, right_data = [], []    
        for i in range(len(data)):
            if data[i, feature] <= value:
                left_data.append(data[i])   # keep the whole row, not a single value
            else:
                right_data.append(data[i])
        return np.array(left_data), np.array(right_data)

    def compute_entropy(self, data):
        '''Compute the entropy of a node from the labels in the last column'''
        entropy = 0.
        if len(data) == 0:  # an empty branch contributes no entropy
            return entropy
        count = np.zeros(3) # counts of the 3 iris classes in this node
        for i in range(len(data)):
            count[int(data[i, -1])] += 1
        for i in range(3):
            term = count[i] / len(data)
            if term != 0:
                entropy += (term * np.log2(term))
        return -entropy
    
    def compute_information_gain(self, data, feature, value):
        '''Compute the information gain of a candidate split'''
        # feature: one of the column indices 0, 1, 2, 3
        left_data, right_data = self.split_dataset(data, feature, value)
        left_entropy = self.compute_entropy(left_data)
        right_entropy = self.compute_entropy(right_data)
        w_left = len(left_data) / len(data)
        w_right = len(right_data) / len(data)
        weighted_entropy = w_left * left_entropy + w_right * right_entropy                                             
        information_gain = self.compute_entropy(data) - weighted_entropy
        return information_gain

    def get_best_split(self, data):
        '''Decide the split for this node'''   
        best_feature, best_value = -1, None
        max_info_gain = -1
        # Try every feature and every observed value; keep the split with
        # the largest information gain as the decision for this node
        for feature in range(self.k):
            values = set(row[feature] for row in data)
            for value in values:
                info_gain = self.compute_information_gain(data, feature, value)
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_feature = feature
                    best_value = value

        return best_feature, max_info_gain, best_value

    def build_tree_recursive(self, data, branch_name, current_depth):
        # Stop splitting when the maximum depth is reached or the node is empty
        if current_depth == self.max_depth or len(data) == 0:
            formatting = " "*current_depth + "-"*current_depth
            print(f"{formatting} {branch_name} leaf node with {len(data)} samples")
            return
        best_feature, max_info_gain, best_value = self.get_best_split(data) 
        tree.append((current_depth, branch_name, best_feature))
        
        formatting = "-"*current_depth
        print(f"{formatting} Depth {current_depth}, {branch_name}: Split on feature: {best_feature}")
        
        left_data, right_data = self.split_dataset(data, best_feature, best_value)
        
        self.build_tree_recursive(left_data, "Left", current_depth+1)
        self.build_tree_recursive(right_data, "Right", current_depth+1)
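
Note the design differences from the graded version above: here split_dataset thresholds a continuous feature (data[i, feature] <= value) rather than testing a one-hot feature, so get_best_split must search over candidate threshold values as well as features, and compute_entropy generalizes to the three iris classes.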

def load_data():
    iris = load_iris()
    iris_feature = iris.data
    iris_target = iris.target
    # The data could also be split into training and test sets:
    # X_train, X_test = iris_feature[:120, :], iris_feature[120:, :]
    # y_train, y_test = iris_target[:120], iris_target[120:]
    return iris_feature, iris_target

if __name__ == '__main__':
    tree = []
    X_train, y_train = load_data()
    model = DecisionTree(X_train, y_train, "Root", max_depth=3)
    # Attach the labels as the last column so every split keeps them aligned
    data = np.column_stack((X_train, y_train))
    model.build_tree_recursive(data, "Root", current_depth=0)