# Python implementation of the ID3 decision-tree algorithm
# -*- coding: utf-8 -*-
"""ID3 decision tree: entropy-based attribute selection, tree construction,
prediction, and accuracy evaluation.

Conventions used throughout:
- the LAST column of a DataFrame is the class label; all other columns are
  candidate split attributes;
- a tree node is a dict {attribute: {value: subtree_or_label}}; a leaf is a
  plain label value.
"""
import numpy as np
import pandas as pd


def H(tdata):
    """Shannon entropy (in bits, >= 0) of the class-label column of *tdata*.

    Fix: the original summed p*log2(p) without negation, returning the
    NEGATIVE entropy; callers compensated with extra sign flips.  Here the
    sign is correct at the source.  Also replaces the long-removed
    `DataFrame.ix` indexer with `.iloc`.
    """
    label_col = tdata.shape[1] - 1
    C = tdata.iloc[:, label_col]
    total = len(C)
    result = 0.0
    for count in C.value_counts():
        p = count / total
        result -= p * np.log2(p)
    return result


def tiaojiandi(dataset, T):
    """Conditional entropy H(label | T): entropy remaining after splitting
    *dataset* on attribute *T*, weighted by subset size.

    Returns the same (positive) value as the original `-xiaodi`; the double
    negation is gone now that H() itself is non-negative.
    """
    xiaodi = 0.0
    for value in dataset[T].unique():
        subset = dataset[dataset[T] == value]
        xiaodi += (len(subset) / len(dataset)) * H(subset)
    return xiaodi


def maxgain(dataset):
    """Attribute with the largest information gain.

    gain(T) = H(dataset) - H(label | T); since H(dataset) is the same for
    every T, minimising the conditional entropy is equivalent to maximising
    the gain, so we take argmin over tiaojiandi() directly.
    """
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    cond_entropy = np.array([tiaojiandi(dataset, f) for f in features])
    return features[cond_entropy.argmin()]


def split(dataset, feature, value):
    """Rows of *dataset* where feature == value, with *feature* dropped.

    Fix: `.copy()` before deleting the column — the original deleted from a
    slice view, mutating (or warning about) the caller's DataFrame.
    """
    newdata = dataset[dataset[feature] == value].copy()
    del newdata[feature]
    return newdata


def classfiy(C):
    """Majority class label of Series *C*, as a string.

    Fix: the original did `value_counts().sort_index().index[-1]`, which
    returns the alphabetically-last label, not the most frequent one;
    `idxmax()` returns the label with the highest count, as intended
    ("when attributes are exhausted, the majority class becomes the leaf").
    """
    return str(C.value_counts().idxmax())


def decision_tree(dataset):
    """Recursively build an ID3 tree from *dataset*.

    Base cases: all labels identical -> return that label; no attributes
    left -> return the majority label.  Otherwise split on the attribute
    with the highest information gain and recurse on each subset.
    """
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    C = list(dataset.iloc[:, n])
    if C.count(C[0]) == len(C):       # pure node: single class remains
        return C[0]
    if len(features) == 0:            # attributes exhausted: majority vote
        return classfiy(dataset.iloc[:, n])
    feature = maxgain(dataset)
    tree = {feature: {}}
    for value in dataset[feature].unique():
        tree[feature][value] = decision_tree(split(dataset, feature, value))
    return tree


def predict(tree, test):
    """Predicted label for every row of DataFrame *test*.

    Fix: the original reassigned the *tree* parameter while walking, so the
    tree was destroyed after the first row and every later row was looked up
    against a leaf.  A per-row cursor (`node`) leaves *tree* intact.  The
    hard-coded `0:4` column slice is gone; the row dict may contain extra
    columns (e.g. the label) — only keys named by the tree are consulted.

    Raises KeyError if a row contains an attribute value never seen in
    training.
    """
    result = []
    for i in range(len(test)):
        row = test.iloc[i].to_dict()
        node = tree                    # walk a cursor, never the tree itself
        while isinstance(node, dict):
            key = list(node.keys())[0]
            node = node[key][row[key]]
        result.append(node)
    return result


def pinggu(tree, test):
    """Accuracy of *tree* on *test*: fraction of rows whose 'Play' label
    matches the prediction.

    NOTE(review): like the original, this writes a 'result' column into the
    caller's *test* DataFrame in place — kept for backward compatibility.
    """
    test['result'] = predict(tree, test)
    return len(test[test['Play'] == test['result']]) / len(test)


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers file I/O on
    # hard-coded paths (the original ran at import time and also left the
    # final `accuary =` assignment dangling).
    train = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\train.csv')
    tree = decision_tree(train)
    test = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\test.csv')
    accuracy = pinggu(tree, test)
    print(accuracy)