Machine Learning in action --CART--regTrees.py

最近在自学机器学习,应导师要求,先把《Machine Learning with R》动手刷了一遍,感觉R真不能算是一门计算机语言,也就是一个功能复杂的计算器。所以这次决定使用经典教材《Machine Learning in action》。因为开学得换workstation,怕到时候代码又丢了,所以就索性开个博客,把代码上传上来。

因为书上的原代码有很多错误,并且网上的许多博客的代码也是没有改正的,这次我把修正过的代码po上来

version:python3.5

Talk is cheap. Show me the code.

函数定义代码


#coding=utf-8
'''
Created on Aug 19, 2017
Tree-Based Regression Methods
@author: Edgis
'''
from numpy import *
from tkinter import *
import matplotlib.pyplot as plt
import matplotlib



def loadDataSet(fileName):
    """Load a tab-delimited text file into a list of float rows.

    Each line is split on tabs and every field is converted to float,
    so every column (including the last, the target) must be numeric.

    :param fileName: path to the data file
    :return: list of rows, each a list of floats
    """
    dataMat = []
    # `with` guarantees the handle is closed (the original leaked it)
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            # map every field of the row to a float
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    """Binary-split *dataSet* on column *feature* at threshold *value*.

    :return: (rows where feature > value, rows where feature <= value)
    """
    above = nonzero(dataSet[:, feature] > value)[0]
    below = nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[above, :], dataSet[below, :]

def regErr(dataSet):
    """Total squared error of the target (last) column.

    Computed as per-sample variance times the sample count.
    """
    sampleCount = shape(dataSet)[0]
    return var(dataSet[:, -1]) * sampleCount

def regLeaf(dataSet):
    """Leaf model for a regression tree: mean of the target column."""
    return dataSet[:, -1].mean()


def createTree(dataSet, leafType = regLeaf, errType = regErr, ops=(1,4)):
    """Recursively grow a CART tree over *dataSet*.

    Internal nodes are dicts with keys 'spIndex', 'spVal', 'left',
    'right'; leaves are whatever *leafType* returns.

    :param leafType: builds a leaf model from a data subset
    :param errType: measures the error of a data subset
    :param ops: (tolS, tolN) pre-pruning parameters for chooseBestSplit
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        # no worthwhile split: this subset becomes a leaf
        return val
    node = {"spIndex": feat, "spVal": val}
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    # recurse on each half of the split
    node["left"] = createTree(lSet, leafType, errType, ops)
    node["right"] = createTree(rSet, leafType, errType, ops)
    return node

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Pick the (feature, value) split that most reduces error.

    ops = (tolS, tolN): minimum error reduction for a split to count,
    and minimum number of samples allowed in each branch.
    Returns (None, leaf value) when the subset should become a leaf.
    """
    tolS, tolN = ops
    # all target values identical -> nothing left to explain, make a leaf
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = shape(dataSet)
    baseErr = errType(dataSet)
    lowestErr = inf
    bestIndex, bestValue = 0, 0
    for featIndex in range(n - 1):  # last column is the target, skip it
        # candidate thresholds: every distinct value of this feature
        for splitVal in set((dataSet[:, featIndex].T.A.tolist())[0]):
            left, right = binSplitDataSet(dataSet, featIndex, splitVal)
            if shape(left)[0] < tolN or shape(right)[0] < tolN:
                continue  # a branch would be too small
            splitErr = errType(left) + errType(right)
            if splitErr < lowestErr:
                bestIndex, bestValue, lowestErr = featIndex, splitVal, splitErr
    # best split barely helps -> make a leaf instead
    if baseErr - lowestErr < tolS:
        return None, leafType(dataSet)
    left, right = binSplitDataSet(dataSet, bestIndex, bestValue)
    if shape(left)[0] < tolN or shape(right)[0] < tolN:
        return None, leafType(dataSet)
    return bestIndex, bestValue

def isTree(obj):
    """True when *obj* is an internal node (stored as a dict), not a leaf."""
    kindName = type(obj).__name__
    return kindName == 'dict'

def getMean(tree):
    """Collapse a subtree to a single value: the recursive mean of its leaves.

    Mutates *tree* in place (children replaced by their collapsed means)
    and returns the average of the two children.
    """
    for side in ('right', 'left'):  # original resolves right first, then left
        if type(tree[side]).__name__ == 'dict':
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0

def prune(tree, testData):
    """Post-prune *tree* bottom-up against held-out *testData*.

    Wherever merging two sibling leaves into their mean lowers the
    squared error on the test data, the merge is performed. Mutates
    the tree in place and returns the (possibly collapsed) subtree.
    """
    # no test samples reach this node: collapse the whole subtree to its mean
    if shape(testData)[0] == 0:
        return getMean(tree)
    # at least one child is a subtree: split the test data the same way
    # the training split did, so each child is pruned on its own share
    if (isTree(tree['right']) or isTree(tree['left'])):# left or right is a subtree
        lSet, rSet = binSplitDataSet(testData, tree['spIndex'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    # both children are now leaves: decide whether merging them helps
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spIndex'], tree['spVal'])
        # test-set error if the split is kept as-is
        errorNoMerge = sum(power(lSet[:,-1] - tree['left'], 2)) +\
            sum(power(rSet[:,-1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right'])/2.0
        # test-set error if both leaves are replaced by their mean
        errorMerge = sum(power(testData[:, -1] - treeMean , 2))
        if errorMerge < errorNoMerge :
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree

def linearSolve(dataSet):
    """Fit ordinary least squares y = X*ws on *dataSet*.

    The last column is the target; the remaining columns become X with
    a leading column of ones for the intercept.

    :return: (ws, X, Y) — weights (n x 1), design matrix, target column
    :raises NameError: if X^T X is singular (no unique solution)
    """
    m, n = shape(dataSet)
    # design matrix: column 0 is all ones (intercept), then the features
    X = mat(ones((m, n)))
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if linalg.det(xTx) == 0:
        # one coherent message (the original passed a tuple of fragments)
        raise NameError('This matrix is singular, cannot do inverse, '
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def modelLeaf(dataSet):
    """Leaf model for a model tree: the fitted linear weight vector."""
    weights, _, _ = linearSolve(dataSet)
    return weights

def modelErr(dataSet):
    """Sum of squared residuals of a linear fit on *dataSet*."""
    ws, X, Y = linearSolve(dataSet)
    residuals = Y - X * ws
    return sum(power(residuals, 2))

def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the stored mean; *inDat* is ignored."""
    prediction = float(model)
    return prediction

def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf: prepend a 1 for the intercept, dot with ws.

    :param model: (n+1) x 1 weight vector from linearSolve
    :param inDat: 1 x n row of feature values
    """
    n = shape(inDat)[1]
    row = mat(ones((1, n + 1)))
    row[0, 1:] = inDat
    return float(row * model)

def treeForeCast(tree, inDat, modelEval = regTreeEval):
    """Predict one value for *inDat* by descending *tree* to a leaf.

    :param modelEval: how to evaluate a leaf (regTreeEval or modelTreeEval)
    """
    # already at a leaf: evaluate it directly
    if not isTree(tree):
        return modelEval(tree, inDat)
    # descend left when the split feature exceeds the threshold
    side = 'left' if inDat[tree['spIndex']] > tree['spVal'] else 'right'
    child = tree[side]
    if isTree(child):
        return treeForeCast(child, inDat, modelEval)
    return modelEval(child, inDat)

def createForeCast(tree, testData, modelEval = regTreeEval):
    """Predict every row of *testData*; returns an m x 1 column matrix."""
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for row in range(m):
        yHat[row, 0] = treeForeCast(tree, mat(testData[row]), modelEval)
    print("yHat : ",'\n',yHat)
    return yHat

上面代码块只是定义了主要的函数,离运行还差一点。由于书原文中,采用了使用 iPython 命令行的运行方式,但是博主比较懒,所以干脆舍弃掉原来的方式。

废话不多说,直接上代码

实验1

if __name__=="__main__":
    testMat = mat(eye(4))
    print("testMat:",'\n',testMat)
    mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
    print("mat0",'\n',mat0)
    print("mat1",'\n',mat1)

实验2 :

if __name__ == "__main__":
    myDat = loadDataSet('ex0.txt')
    myDat = mat(myDat)
    print("myDat ",'\n',myDat)
    myMat = mat(myDat)
    print("myMat ",'\n',myMat)
    createTree(myMat)
    print("createTree(myMat) ",'\n',createTree(myMat))

实验3 :

if __name__ == "__main__":
    myDat = loadDataSet('ex2.txt')
    myDat = mat(myDat)
    print("myDat ", '\n', myDat)
    myMat = mat(myDat)
    print("myMat ", '\n', myMat)
    createTree(myMat, ops=(10000,4))
    print("createTree(myMat) ", '\n', createTree(myMat, ops=(10,4)))

实验4 :

if __name__ == "__main__":
    myDat2 = loadDataSet('ex2.txt')
    myDat2 = mat(myDat2)
    myTree = createTree(myDat2, ops=(0,1))
    print("myTree " ,myTree)
    myDatTest = loadDataSet('ex2test.txt')
    myMat2Test = mat(myDatTest)
    print("pruned Tree", prune(myTree, myMat2Test))

实验5 :

if __name__ == "__main__":
    myDat2 = loadDataSet('exp2.txt')
    myMat2 = mat(myDat2)
    xcord1 = myMat2[:, 0]
    ycord1 = myMat2[:, -1]
    print(myMat2)
    myTree = createTree(myMat2, modelLeaf, modelErr, (1, 10))

实验6 :

if __name__ == "__main__":
     #回归树
    trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))
    testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))
    myTree = createTree(trainMat, ops = (1, 20))
    yHat = createForeCast(myTree, testMat[:,0])
    print('回归树: ',corrcoef(yHat, testMat[:,1], rowvar=0)[0,1])
    #模型树
    myTree1 = createTree(trainMat, modelLeaf, modelErr, (1,20))
    yHat = createForeCast(myTree1, testMat[:,0], modelTreeEval)
    print('模型树: ',corrcoef(yHat, testMat[:,1], rowvar=0)[0,1])
    #标准线性回归
    ws, X, Y = linearSolve(trainMat)
    for i in range(shape(testMat)[0]):
        yHat[i] = testMat[i,0] * ws[1,0] +ws[0,0]
    print('标准线性回归树: ',corrcoef(yHat, testMat[:,1], rowvar=0)[0,1])

更多请戳github
https://github.com/Edgis/Machine-learning-in-action/blob/master/regTrees.py

posted @ 2017-08-21 21:01  Edgis  阅读(125)  评论(0编辑  收藏  举报