F#真棒 - 决策树 – 第二部分
在我的前一篇文章中,我们过了一遍 ID3 算法背后的理论。既然已经啃完了那些痛苦的数学公式,现在让我们写一些代码吧!下面是该算法的一个 F# 实现(文章底部附有下载)。
open System

/// A single training example: the observed weather attributes plus the
/// classification we are trying to predict (did we play tennis?).
type Record =
    { Outlook     : string
      Temperature : string
      Humidity    : string
      Wind        : string
      PlayTennis  : bool }

    /// Given an attribute name return its value.
    member this.GetAttributeValue(attrName) =
        match attrName with
        | "Outlook"     -> this.Outlook
        | "Temperature" -> this.Temperature
        | "Humidity"    -> this.Humidity
        | "Wind"        -> this.Wind
        | _ -> failwithf "Invalid attribute name '%s'" attrName

    /// Make the %O format specifier look all pretty like.
    override this.ToString() =
        sprintf
            "{Outlook = %s, Temp = %s, Humidity = %s, Wind = %s, PlayTennis = %b}"
            this.Outlook this.Temperature this.Humidity this.Wind this.PlayTennis

/// A node of the learned decision tree.
type DecisionTreeNode =
    // Attribute name and value / child node list
    | DecisionNode of string * (string * DecisionTreeNode) seq
    // Decision and corresponding evidence
    | Leaf of bool * Record seq

// ----------------------------------------------------------------------------

/// Return the total true, total false, and total count for a set of Records.
/// (The parameter is annotated so the member access inside the fold type-checks
/// even before any call site has fixed the element type.)
let countClassifications (data : Record seq) =
    data
    |> Seq.fold
        (fun (t, f, c) item ->
            if item.PlayTennis then (t + 1, f, c + 1) else (t, f + 1, c + 1))
        (0, 0, 0)

// ----------------------------------------------------------------------------

/// Return the theoretical number of bits required to classify the information.
/// A 50/50 mix returns 1.0; a set that is 100% true or 100% false returns 0.0.
let entropy data =
    let trueValues, falseValues, totalCount = countClassifications data
    // Log2 of 0.0 diverges, so a pure set (this also covers empty input,
    // where 0 = 0) short-circuits to zero entropy.
    if trueValues = totalCount || falseValues = totalCount then
        0.0
    else
        let probTrue  = float trueValues  / float totalCount
        let probFalse = float falseValues / float totalCount
        -probTrue * Math.Log(probTrue, 2.0) + -probFalse * Math.Log(probFalse, 2.0)

/// Given a set of data, how many bits do you save if you know the provided attribute?
let informationGain (data : Record seq) attr =
    // Seq.length is O(n); compute the total once instead of once per partition.
    let totalRows = float (Seq.length data)
    // Partition the data into new sets based on each unique value of the
    // given attribute, e.g. [ where Outlook = rainy ], [ where Outlook = overcast ], ...
    let divisionsByAttribute =
        data |> Seq.groupBy (fun item -> item.GetAttributeValue(attr))
    // Expected entropy after the split: each partition's entropy weighted by
    // the fraction of rows that fall into it.
    let entropyBasedOnSplit =
        divisionsByAttribute
        |> Seq.sumBy (fun (_, rowsWithThatValue) ->
            let weight = float (Seq.length rowsWithThatValue) / totalRows
            weight * entropy rowsWithThatValue)
    entropy data - entropyBasedOnSplit

// ----------------------------------------------------------------------------

/// Given a list of attributes left to branch on and training data,
/// construct a decision tree node.
let rec createTreeNode data attributesLeft =
    let totalTrue, totalFalse, _ = countClassifications data
    // If we have tested all attributes, or every row already shares the same
    // classification, label this node with the most often occurring value.
    if List.isEmpty attributesLeft || totalTrue = 0 || totalFalse = 0 then
        Leaf(totalTrue > totalFalse, data)
    // Otherwise, create a proper decision tree node and branch accordingly.
    else
        // Branch on the attribute that yields the highest information gain.
        let attributeWithMostInformationGain =
            attributesLeft
            |> List.map (fun attrName -> attrName, informationGain data attrName)
            |> List.maxBy snd
            |> fst
        let remainingAttributes =
            attributesLeft |> List.filter ((<>) attributeWithMostInformationGain)
        // Partition the data based on the chosen attribute's values.
        let partitionedData =
            data
            |> Seq.groupBy (fun (r : Record) ->
                r.GetAttributeValue(attributeWithMostInformationGain))
        // Recursively build a child node for each partition.
        let childNodes =
            partitionedData
            |> Seq.map (fun (attrValue, subData) ->
                attrValue, createTreeNode subData remainingAttributes)
        DecisionNode(attributeWithMostInformationGain, childNodes)
entropy 和 informationGain 这两个函数在上篇文章中已经介绍过了,所以让我们来看看实际的决策树是怎样构建的。其中的关键一步是找出决策树的最佳分割属性,而用 F# 可以把这段逻辑表达得非常优雅。
let attributeWithMostInformationGain = attributesLeft |> List.map(fun attrName -> attrName, (informationGain data attrName)) |> List.maxBy(fun (attrName, infoGain) -> infoGain) |> fst
首先,它需要所有分裂的潜在属性...
attributesLeft
...然后把每个属性名映射成一个由属性名和对应信息增益组成的元组...
|> List.map(fun attrName -> attrName, (informationGain data attrName))
...再从新生成的列表中取出那个最高信息增益的元组…
|> List.maxBy(fun (attrName, infoGain) -> infoGain)
...最后返回该元组的第一个元素,即有最高信息增益的属性。
|> fst
当你能在内存里构成一个决策树时,怎样获取它呢?最简单的方法是在控制台输出它。
这里的代码非常直截了当。注意"填充参数"(padding parameter)的使用,它使递归调用获得越来越多的缩进。在控制台打印树形数据结构时,这是一个非常有用的技巧。
/// Print the decision tree to the console, indenting each level of
/// recursion by four extra spaces so the structure is visible.
let rec printID3Result indent node =
    let pad = String.replicate indent " "
    match node with
    | Leaf(classification, data) ->
        printfn "\tClassification = %b" classification
        // Uncomment to also dump the supporting evidence rows:
        // for item in data do printfn "%s->%s" pad (item.ToString())
    | DecisionNode(attribute, childNodes) ->
        // Finish previous line
        printfn ""
        printfn "%sBranching on attribute [%s]" pad attribute
        for (attrValue, childNode) in childNodes do
            printf "%s->With value [%s]..." pad attrValue
            printID3Result (indent + 4) childNode
然而现在都快 2010 年了,虽然飞行汽车还没有等到,但至少我们可以做得比在控制台打印数据更好。理想情况下,我们可以生成一些像这样漂亮的图像:
你可以用 Microsoft Visio 手工绘制这棵决策树,但幸运的是,已经有现成的工具为你代劳了。AT&T 研究院开发了一个很棒的工具,叫 GraphViz。虽然最终结果并非完美无瑕,但用它来入手已经足够简单了。
下面这个函数把决策树转换到一个GraphViz能够进行绘图的格式(复制打印出来的内容到该工具,然后用默认设置进行绘图)。
/// Prints the tree in a format amenable to GraphViz
/// See http://www.graphviz.org/ for more format
let printInGraphVizFormat node =
    // Each node's DOT identifier is its parent's identifier with its own
    // label appended, which keeps identifiers unique down every path.
    let rec emit parentName name node =
        let nodeId = parentName + name
        match node with
        | DecisionNode(attribute, childNodes) ->
            // Declare the decision node itself.
            printfn "\"%s\" [ label = \"%s\" ];" nodeId attribute
            // Link from parent to this node (unless it's the root).
            if parentName <> "" then
                printfn "\"%s\" -> \"%s\" [ label = \"%s\" ];" parentName nodeId name
            for (attrValue, childNode) in childNodes do
                emit nodeId attrValue childNode
        | Leaf(classification, _) ->
            let label = if classification then "Yes" else "No"
            // Declare the leaf and link it from its parent.
            printfn "\"%s\" [ label = \"%s\" ];" nodeId label
            printfn "\"%s\" -> \"%s\" [ label = \"%s\" ];" parentName nodeId name
    printfn "digraph g {"
    emit "" "root" node
    printfn "}"
现在你有了 F# 版的 ID3。只需一点数学和一些巧妙的输出,你就可以为各种机器学习需求构造决策树——无论是在以后的工作中挖掘客户交易、分析服务器日志,还是为你的杀手机器人编写程序去寻找莎拉·康纳(电影《终结者》里的女主角)。
<完全无耻的插入>如果你想学习更多F#,就去看下O'Reilly的Programming F#(在F#学习资料汇总里有下载)</完全无耻的插入>
原文链接 :http://blogs.msdn.com/b/chrsmith/archive/2009/11/02/awesome-f-decision-trees-part-ii.aspx