[Machine Learning] Decision Tree
Entropy
Mathematical Expression
\[
H(p_1) = -p_1 \log_2(p_1) - (1 - p_1)\log_2(1 - p_1)
\]
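As a quick sanity check (worked by hand, not part of the original notes): a 50/50 node has the maximal entropy of 1 bit, while a pure node has zero entropy, using the convention \(0 \log_2 0 = 0\).
\[
H(0.5) = -0.5\log_2 0.5 - 0.5\log_2 0.5 = 1, \qquad H(0) = H(1) = 0
\]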
Code
import numpy as np

# UNQ_C1
# GRADED FUNCTION: compute_entropy

def compute_entropy(y):
    """
    Computes the entropy for a node.

    Args:
        y (ndarray): Numpy array indicating whether each example at a node is
                     edible (`1`) or poisonous (`0`)

    Returns:
        entropy (float): Entropy at that node
    """
    # You need to return the following variables correctly
    entropy = 0.

    ### START CODE HERE ###
    if len(y) != 0:
        # len(y[y == 1]) counts the elements of y equal to 1, equivalent to np.sum(y == 1)
        p1 = len(y[y == 1]) / len(y)
        # A pure node (p1 = 0 or 1) has zero entropy; skip it to avoid log2(0)
        if p1 != 0 and p1 != 1:
            entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
    ### END CODE HERE ###

    return entropy
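A quick check with a toy label vector (my own example, not from the assignment): a node that is half edible and half poisonous should have the maximal entropy of 1 bit.

y_example = np.array([1, 1, 0, 0])
print(compute_entropy(y_example))  # -> 1.0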
Information Gain
Mathematical Expression
\[
\text{Information Gain} = H(p_1^\text{node}) - \left(w^{\text{left}} H(p_1^\text{left}) + w^{\text{right}} H(p_1^\text{right})\right)
\]
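For intuition (again worked by hand): a node with 10 samples and \(p_1^\text{node} = 0.5\) has entropy 1; a split that sends the 5 positive samples left and the 5 negative samples right leaves both children pure, so
\[
\text{Information Gain} = 1 - \left(\tfrac{1}{2} \cdot 0 + \tfrac{1}{2} \cdot 0\right) = 1.
\]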
Code
# UNQ_C3
# GRADED FUNCTION: compute_information_gain

def compute_information_gain(X, y, node_indices, feature):
    """
    Compute the information gain of splitting the node on a given feature.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices, i.e. the samples being
                                considered in this step.
        feature (int):          Index of the feature to split on

    Returns:
        information_gain (float): Information gain of the split
    """
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)

    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]

    # You need to return the following variables correctly
    information_gain = 0

    ### START CODE HERE ###
    # Entropy at the node and at each child
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)

    # Weights: fraction of the node's samples sent to each child
    w_left = len(X_left) / len(X_node)
    w_right = len(X_right) / len(X_node)

    # Weighted entropy of the children
    weighted_entropy = w_left * left_entropy + w_right * right_entropy

    # Information gain
    information_gain = node_entropy - weighted_entropy
    ### END CODE HERE ###

    return information_gain
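compute_information_gain calls split_dataset, which is not shown in this section. In the original assignment the features are one-hot encoded, so a minimal sketch of it (my reconstruction, assuming a 0/1 feature matrix where a value of 1 goes to the left branch) looks like this:

def split_dataset(X, node_indices, feature):
    '''Split the node's samples on a one-hot feature: value 1 goes left, 0 goes right.'''
    left_indices, right_indices = [], []
    for i in node_indices:
        if X[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices, right_indices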
# UNQ_C4
# GRADED FUNCTION: get_best_split

def get_best_split(X, y, node_indices):
    """
    Returns the optimal feature to split the node data on.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices, i.e. the samples being
                                considered in this step.

    Returns:
        best_feature (int): The index of the best feature to split on
    """
    # Some useful variables
    num_features = X.shape[1]

    # You need to return the following variables correctly
    best_feature = -1
    max_info_gain = 0

    ### START CODE HERE ###
    # Keep the feature with the largest positive information gain
    for feature in range(num_features):
        info_gain = compute_information_gain(X, y, node_indices, feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
    ### END CODE HERE ###

    return best_feature
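A quick check with made-up one-hot data (relying on the split_dataset sketch above, not the assignment's mushroom dataset): only feature 0 separates the classes, so it should win.

X_toy = np.array([[1, 1],
                  [1, 0],
                  [0, 1],
                  [0, 0]])
y_toy = np.array([1, 1, 0, 0])
root_indices = [0, 1, 2, 3]
print(get_best_split(X_toy, y_toy, root_indices))  # -> 0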
# Not graded
tree = []

def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using a recursive algorithm that splits the dataset into 2 subgroups at each node.
    This function just prints the tree.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features)
        y (array like):         list or ndarray with n_samples containing the target variable
        node_indices (ndarray): List containing the active indices, i.e. the samples being
                                considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int):        Max depth of the resulting tree.
        current_depth (int):    Current depth. Parameter used during recursive call.
    """
    # Maximum depth reached - stop splitting
    if current_depth == max_depth:
        formatting = " " * current_depth + "-" * current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return

    # Otherwise, get the best feature at this node and split the data on it
    best_feature = get_best_split(X, y, node_indices)
    tree.append((current_depth, branch_name, best_feature, node_indices))

    formatting = "-" * current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))

    # Split the dataset at the best feature
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)

    # Continue splitting the left and right children, incrementing the current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth + 1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth + 1)
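The same toy arrays can be used to print a tiny tree (illustrative only; the original lab builds the tree on its own X_train, y_train, and root_indices):

build_tree_recursive(X_toy, y_toy, root_indices, "Root", max_depth=1, current_depth=0)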
The code below is a standalone re-implementation of the same idea on the iris dataset: 3 classes and 4 continuous features, so each split compares one feature against a threshold value instead of a 0/1 test. Nodes are represented by lists of sample indices so that the labels stay aligned with the data.

import numpy as np
from sklearn.datasets import load_iris


class DecisionTree():
    def __init__(self, X_train, y_train, branch_name, max_depth) -> None:
        self.X_train = X_train
        self.y_train = y_train
        self.k = self.X_train.shape[1]  # number of features
        self.branch_name = branch_name
        self.max_depth = max_depth

    def split_dataset(self, indices, feature, value):
        '''Split the node's samples on feature <= value.'''
        left_indices, right_indices = [], []
        for i in indices:
            if self.X_train[i, feature] <= value:
                left_indices.append(i)
            else:
                right_indices.append(i)
        return left_indices, right_indices

    def compute_entropy(self, indices):
        '''Compute the entropy of the labels at this node.'''
        entropy = 0.
        if len(indices) == 0:
            return entropy
        count = np.zeros(3)  # counts of the 3 iris classes at this node
        for i in indices:
            count[self.y_train[i]] += 1
        for c in range(3):
            p = count[c] / len(indices)
            if p != 0:
                entropy -= p * np.log2(p)
        return entropy

    def compute_information_gain(self, indices, feature, value):
        '''Compute the information gain of splitting on (feature, value).'''
        # feature: one of 0, 1, 2, 3
        left_indices, right_indices = self.split_dataset(indices, feature, value)
        left_entropy = self.compute_entropy(left_indices)
        right_entropy = self.compute_entropy(right_indices)
        w_left = len(left_indices) / len(indices)
        w_right = len(right_indices) / len(indices)
        weighted_entropy = w_left * left_entropy + w_right * right_entropy
        information_gain = self.compute_entropy(indices) - weighted_entropy
        return information_gain

    def get_best_split(self, indices):
        '''Decide the split for this node.'''
        best_feature, best_value = -1, None
        max_info_gain = -1
        # Try every feature and every observed value; keep the split with the largest gain
        for feature in range(self.k):
            values = set(self.X_train[i, feature] for i in indices)
            for value in values:
                info_gain = self.compute_information_gain(indices, feature, value)
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_feature = feature
                    best_value = value
        return best_feature, max_info_gain, best_value

    def build_tree_recursive(self, indices, branch_name, current_depth):
        if current_depth == self.max_depth:
            formatting = " " * current_depth + "-" * current_depth
            print(f"{formatting} {branch_name} leaf node with indices {indices}")
            return

        best_feature, max_info_gain, best_value = self.get_best_split(indices)
        tree.append((current_depth, branch_name, best_feature, best_value))

        formatting = "-" * current_depth
        print(f"{formatting} Depth {current_depth}, {branch_name}: Split on feature {best_feature} at value {best_value}")

        left_indices, right_indices = self.split_dataset(indices, best_feature, best_value)
        self.build_tree_recursive(left_indices, "Left", current_depth + 1)
        self.build_tree_recursive(right_indices, "Right", current_depth + 1)


def load_data():
    iris = load_iris()
    iris_feature = iris.data
    iris_target = iris.target
    # Split into training and test data if needed
    # X_train, X_test = iris_feature[:120, :], iris_feature[120:, :]
    # y_train, y_test = iris_target[:120], iris_target[120:]
    return iris_feature, iris_target


if __name__ == '__main__':
    tree = []
    X_train, y_train = load_data()
    root_indices = list(range(len(X_train)))
    model = DecisionTree(X_train, y_train, "Root", max_depth=3)
    model.build_tree_recursive(root_indices, "Root", current_depth=0)