Python implementation of ID3

# -*- coding: utf-8 -*-
# ID3 decision tree: the input is a pandas DataFrame whose last column is the class label.
import numpy as np
import pandas as pd

# Entropy of a dataset's class column (the last column):
# H(D) = -sum_k p_k * log2(p_k), where p_k is the fraction of samples in class k.
def H(tdata):
    n = tdata.shape[1] - 1          # index of the class column
    C = tdata.iloc[:, n]
    result = 0
    counts = list(C.value_counts())
    for i in range(len(counts)):
        p = counts[i] / len(C)
        result = result - p * np.log2(p)
    return result
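
# A quick sanity check on a small hand-made frame (illustrative only: _toy and its
# 'Outlook'/'Wind'/'Play' values are made up here, independent of the CSV files used later).
# Two 'yes' and two 'no' labels give exactly one bit of entropy.
_toy = pd.DataFrame({'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Overcast'],
                     'Wind':    ['Weak',  'Strong', 'Weak',    'Strong'],
                     'Play':    ['no',    'no',     'yes',     'yes']})
print(H(_toy))   # expected: 1.0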
        
# Conditional entropy of the class given attribute T:
# H(D|T) = sum_v (|D_v|/|D|) * H(D_v), where D_v is the subset of rows with T == v.
def tiaojiandi(dataset, T):
    # partition the dataset by the values of T and weight each subset's entropy
    xiaodi = 0
    for i in dataset[T].unique():
        tdata = dataset[dataset[T] == i]
        p = len(tdata) / len(dataset)
        xiaodi = xiaodi + p * H(tdata)
    return xiaodi
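
# On the toy frame above: splitting on 'Outlook' yields pure subsets (conditional
# entropy 0), while splitting on 'Wind' leaves every subset evenly mixed (entropy 1).
print(tiaojiandi(_toy, 'Outlook'))   # expected: 0.0
print(tiaojiandi(_toy, 'Wind'))      # expected: 1.0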


# Pick the attribute with the maximum information gain. Since H(D) is the same
# for every attribute, maximising the gain H(D) - H(D|T) is equivalent to
# minimising the conditional entropy H(D|T), hence the argmin below.
def maxgain(dataset):
    gain = []
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    for i in range(len(features)):
        di = tiaojiandi(dataset, features[i])
        gain.append(di)
    gain = np.array(gain)
    return features[gain.argmin()]
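
# On the toy frame 'Outlook' has the smaller conditional entropy, so it carries
# the larger information gain and is the attribute chosen for the split.
print(maxgain(_toy))   # expected: Outlook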


# After an attribute has been chosen, keep only the rows where it equals `value`
# and drop that attribute's column from the subset.
def split(dataset, feature, value):
    newdata = dataset[dataset[feature] == value].drop(columns=feature)
    return newdata
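
# Splitting the toy frame on Outlook == 'Sunny' keeps the two 'Sunny' rows
# and drops the 'Outlook' column itself.
print(split(_toy, 'Outlook', 'Sunny'))   # two rows, columns Wind and Play only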

# When no attributes are left, the majority class becomes the leaf.
def classfiy(C):
    counts = C.value_counts()
    return counts.idxmax()
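
# With two 'yes' and one 'no', the majority class 'yes' becomes the leaf.
print(classfiy(pd.Series(['yes', 'yes', 'no'])))   # expected: yes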

# Build the decision tree recursively as nested dicts:
# {attribute: {value: subtree_or_leaf, ...}}
def decision_tree(dataset):
    n = dataset.shape[1] - 1
    features = list(dataset.columns[0:n])
    C = list(dataset.iloc[:, n])
    # all samples share one class: return that class as a leaf
    if C.count(C[0]) == len(C):
        return C[0]
    # no attributes left: return the majority class
    if len(features) == 0:
        return classfiy(dataset.iloc[:, n])
    # otherwise split on the best attribute and recurse on each subset
    feature = maxgain(dataset)
    tree = {feature: {}}
    for value in dataset[feature].unique():
        newdata = split(dataset, feature, value)
        tree[feature][value] = decision_tree(newdata)
    return tree
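
# On the toy frame the whole tree is a single split on 'Outlook': every 'Sunny'
# row is 'no' and every 'Overcast' row is 'yes'.
_toy_tree = decision_tree(_toy)
print(_toy_tree)   # expected: {'Outlook': {'Sunny': 'no', 'Overcast': 'yes'}}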



# Load the training data (the author's local CSV; the last column must be the class
# label, here 'Play') and grow the tree.
train = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\train.csv')
tree = decision_tree(train)

# Predict a class for each test row by walking the tree from the root.
def predict(tree, test):
    result = []
    for i in range(len(test)):
        newdata = test.iloc[i, 0:4].to_dict()   # the attribute columns (first four in the author's test.csv)
        node = tree                             # restart at the root for every sample
        while isinstance(node, dict):
            key = list(node.keys())[0]
            node = node[key][newdata[key]]
        result.append(node)
    return result
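
# Walking the toy tree: a 'Sunny' sample should come out as 'no' and an
# 'Overcast' one as 'yes' (_toy_test is made up for illustration).
_toy_test = pd.DataFrame({'Outlook': ['Sunny', 'Overcast'],
                          'Wind':    ['Weak',  'Weak'],
                          'Play':    ['no',    'yes']})
print(predict(_toy_tree, _toy_test))   # expected: ['no', 'yes']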

# Accuracy: the fraction of test rows whose 'Play' label matches the prediction.
def pinggu(tree, test):
    result = predict(tree, test)
    test['result'] = result
    return len(test[test['Play'] == test['result']]) / len(test)

test = pd.read_csv(r'E:\Python\machine learning\own\decision_tree\test.csv')
accuracy = pinggu(tree, test)
print(accuracy)
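
# The same evaluation on the toy data: both toy predictions match their labels,
# so the accuracy should be 1.0.
print(pinggu(_toy_tree, _toy_test))   # expected: 1.0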
