In my previous post I walked through the theory behind the ID3 algorithm. Now that we have that painful math behind us, let's write some code! Here is an implementation of the algorithm in F# (also available as an attachment at the bottom of this post):

open System

type Record = 
    {
        Outlook     : string
        Temperature : string
        Humidity    : string
        Wind        : string
        PlayTennis  : bool 
    }

    /// Given an attribute name return its value
    member this.GetAttributeValue(attrName) =
        match attrName with
        | "Outlook"     -> this.Outlook
        | "Temperature" -> this.Temperature
        | "Humidity"    -> this.Humidity
        | "Wind"        -> this.Wind
        | _ -> failwithf "Invalid attribute name '%s'" attrName

    /// Make the %O format specifier look all pretty like
    override this.ToString() =
        sprintf
            "{Outlook = %s, Temp = %s, Humidity = %s, Wind = %s, PlayTennis = %b}" 
            this.Outlook
            this.Temperature 
            this.Humidity
            this.Wind
            this.PlayTennis

type DecisionTreeNode =
    // Attribute name and value / child node list    
    | DecisionNode of string * (string * DecisionTreeNode) seq
    // Decision and corresponding evidence
    | Leaf         of bool * Record seq

// ----------------------------------------------------------------------------

/// Return the total true, total false, and total count for a set of Records
let countClassifications data = 
    Seq.fold 
        (fun (t,f,c) item -> 
            match item.PlayTennis with
            | true  -> (t + 1, f, c + 1)
            | false -> (t, f + 1, c + 1))
        (0, 0, 0)
        data

// ----------------------------------------------------------------------------

/// Return the theoretical number of bits required to classify the information.
/// If the data is a 50/50 mix it returns 1; if it is 100% true or false it returns 0.
let entropy data = 
    let (trueValues, falseValues, totalCount) = countClassifications data        

    let probTrue  = (float trueValues)  / (float totalCount)
    let probFalse = (float falseValues) / (float totalCount)

    // Log(0.0) = -infinity, so short circuit the pure all-true / all-false cases
    if trueValues = totalCount || falseValues = totalCount then
        0.0
    else
        -probTrue * Math.Log(probTrue, 2.0) + -probFalse * Math.Log(probFalse, 2.0)

/// Given a set of data, how many bits do you save if you know the provided attribute.
let informationGain (data : Record seq) attr =
    
    // Partition the data into new sets based on each unique value of the given attribute
    // e.g. [ where Outlook = rainy ], [ where Outlook = overcast], [ ... ]
    let divisionsByAttribute = 
        data 
        |> Seq.groupBy(fun item -> item.GetAttributeValue(attr))

    let totalEntropy = entropy data
    let entropyBasedOnSplit =
        divisionsByAttribute
        |> Seq.map(fun (attributeValue, rowsWithThatValue) -> 
                        let ent = entropy rowsWithThatValue
                        let percentageOfTotalRows = (float <| Seq.length rowsWithThatValue) / (float <| Seq.length data)
                        -1.0 * percentageOfTotalRows * ent)
        |> Seq.sum

    totalEntropy + entropyBasedOnSplit
    
// ----------------------------------------------------------------------------

/// Give a list of attributes left to branch on and training data,
/// construct a decision tree node.
let rec createTreeNode data attributesLeft =
    
    let (totalTrue, totalFalse, totalCount) = countClassifications data

    // If we have tested all attributes, then label this node with the
    // most often occurring classification; likewise if everything has
    // the same classification.
    if List.isEmpty attributesLeft || totalTrue = 0 || totalFalse = 0 then
        let mostOftenOccurring = totalTrue > totalFalse
        Leaf(mostOftenOccurring, data)
    
    // Otherwise, create a proper decision tree node and branch accordingly
    else
        let attributeWithMostInformationGain =
            attributesLeft 
            |> List.map(fun attrName -> attrName, (informationGain data attrName))
            |> List.maxBy(fun (attrName, infoGain) -> infoGain)
            |> fst
        
        let remainingAttributes =
            attributesLeft |> List.filter ((<>) attributeWithMostInformationGain)

        // Partition the data based on the attribute's values
        let partitionedData = 
            Seq.groupBy
                (fun (r : Record) -> r.GetAttributeValue(attributeWithMostInformationGain))
                data

        // Create child nodes
        let childNodes =
            partitionedData
            |> Seq.map (fun (attrValue, subData) -> attrValue, (createTreeNode subData remainingAttributes))

        DecisionNode(attributeWithMostInformationGain, childNodes)
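
Here is a quick sketch of how you might drive the algorithm end to end. The records and the trainingData / attributes names below are made up for illustration; they are not the data set from the original attachment.

let trainingData =
    [ { Outlook = "Sunny";    Temperature = "Hot";  Humidity = "High";   Wind = "Weak";   PlayTennis = false }
      { Outlook = "Overcast"; Temperature = "Hot";  Humidity = "High";   Wind = "Weak";   PlayTennis = true  }
      { Outlook = "Rain";     Temperature = "Mild"; Humidity = "High";   Wind = "Strong"; PlayTennis = false }
      { Outlook = "Rain";     Temperature = "Cool"; Humidity = "Normal"; Wind = "Weak";   PlayTennis = true  } ]

// Every attribute is initially available to branch on
let attributes = [ "Outlook"; "Temperature"; "Humidity"; "Wind" ]

// Sanity checks on the helpers (values follow from the toy data above)
let bits = entropy trainingData                    // 1.0 -- a 2 true / 2 false mix
let gain = informationGain trainingData "Outlook"  // 0.5 for this toy data

let tree = createTreeNode trainingData attributes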

The entropy and informationGain functions were covered in the previous post, so let's walk through how the actual decision tree gets built. There is still a bit of work involved in figuring out the best attribute to split on, and F# lets you express it beautifully in code.

let attributeWithMostInformationGain =
    attributesLeft 
    |> List.map(fun attrName -> attrName, (informationGain data attrName))
    |> List.maxBy(fun (attrName, infoGain) -> infoGain)
    |> fst

First, it takes all of the potential attributes to split on...

attributesLeft 

...then maps each attribute name to a tuple of the name and its information gain...

|> List.map(fun attrName -> attrName, (informationGain data attrName))

...then, from the newly generated list, takes the tuple with the highest information gain...

|> List.maxBy(fun (attrName, infoGain) -> infoGain)

...and finally returns the first element of that tuple, namely the attribute with the highest information gain.

|> fst
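
To see the shape of that pipeline in isolation, here is a hypothetical toy version with hard-coded gains (the numbers are invented):

// Made-up (attribute, gain) pairs, just to show the maxBy / fst pattern
let best =
    [ "Outlook", 0.25; "Humidity", 0.15; "Wind", 0.05 ]
    |> List.maxBy (fun (attrName, infoGain) -> infoGain)
    |> fst
// best = "Outlook"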

Once you can construct a decision tree in memory, how do you actually get at it? The simplest way is to print it to the console.

[Image: the decision tree printed to the console]

The code here is quite straightforward. Note the use of the padding parameter, which gives each recursive call more and more indentation. This is a very helpful technique when printing tree-like data structures to the console.

/// Print the decision tree to the console
let rec printID3Result indent node =
    let padding = new System.String(' ', indent)

    match node with
    | Leaf(classification, data) ->
        printfn "\tClassification = %b" classification
        // data |> Seq.iter (fun item -> printfn "%s->%s" padding <| item.ToString())

    | DecisionNode(attribute, childNodes) ->
        printfn "" // Finish previous line
        printfn "%sBranching on attribute [%s]" padding attribute
        
        childNodes
        |> Seq.iter (fun (attrValue, childNode) ->
                        printf "%s->With value [%s]..." padding attrValue
                        printID3Result (indent + 4) childNode)
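
Assuming a tree built as in the earlier sketch, rendering it is a single call; the initial argument of 0 starts the padding at the left margin.

printID3Result 0 tree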

However, it's almost 2010, so in lieu of flying cars perhaps we can at least do better than printing data to a console. Ideally we could generate a sexy image like this:

[Image: the decision tree rendered as a graph by GraphViz]

You could painstakingly construct this decision tree in Microsoft Visio, but fortunately there is already a tool that does the work for you. AT&T Research has produced a great tool called GraphViz. While the end result isn't entirely free of blemishes, it is more than easy enough to get going with.

The following function converts the decision tree into a format that GraphViz can render (copy the printed output into the tool and render it with the default settings).

/// Prints the tree in a format amenable to GraphViz
/// See http://www.graphviz.org/ for more information on the format
let printInGraphVizFormat node =

    let rec printNode parentName name node = 
        match node with
        | DecisionNode(attribute, childNodes) ->

            // Print the decision node
            printfn "\"%s\" [ label = \"%s\" ];" (parentName + name) attribute

            // Print link from parent to this node (unless it's the root)
            if parentName <> "" then
                printfn "\"%s\" -> \"%s\" [ label = \"%s\" ];" parentName (parentName + name) name

            childNodes 
            |> Seq.iter(fun (attrValue, childNode) -> 
                    printNode (parentName + name) attrValue childNode)

        | Leaf(classification, _) ->
            let label =
                match classification with
                | true  -> "Yes"
                | false -> "No"
            
            // Print the leaf node
            printfn "\"%s\" [ label = \"%s\" ];" (parentName + name) label

            // Print link from parent to this node
            printfn "\"%s\" -> \"%s\" [ label = \"%s\" ];" parentName (parentName + name) name

    printfn "digraph g {"
    printNode "" "root" node
    printfn "}"

So there you have it: ID3 in F#. With a little math and some clever printing, you can construct decision trees for all of your machine learning needs, whether in future work you want to mine customer transactions, analyze server logs, or program your killer robot to locate Sarah Connor (the heroine of the Terminator films).

<Completely shameless plug>If you want to learn more F#, check out Programming F# from O'Reilly (a download is available in the F# learning resources roundup).</Completely shameless plug>

Original post: http://blogs.msdn.com/b/chrsmith/archive/2009/11/02/awesome-f-decision-trees-part-ii.aspx