继昨天使用sklearn的决策树分类器生成树图后,今天回到实际应用中。
生成的图片可以使我们更直观地分析树图是否有悖常理,以及是否贴近实际业务情况。
但是开发人员需要把实际生成的树嵌入到服务当中。由于实际生成的树的叶子节点至少有好几百个,甚至好几千个,
开发人员就得手写好几千个 if,而且模型一旦有修改,开发人员的工作量也很大。所以我们得利用
代码来生成代码。
研究dot文件
digraph Tree {
edge [fontname="SimHei"];
node [shape=box, style="filled", color="black",fontname="SimHei"] ;
graph [ranksep=equally, splines=polyline] ;
0 [label="淘宝收入笔数 <= 28.5\ngini = 0.5\nsamples = 174622\nvalue = [87311, 87311]\nclass = 坏人", fillcolor="#e5813900"] ;
1 [label="总期数 <= 4.5\ngini = 0.4941\nsamples = 115162\nvalue = [63849, 51313]\nclass = 坏人", fillcolor="#e5813932"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="信用卡透支率 <= 1.1954\ngini = 0.4991\nsamples = 42552\nvalue = [20372, 22180]\nclass = 好人", fillcolor="#399de515"] ;
1 -> 2 ;
3 [label="七天内申请平台个数 <= 5.5\ngini = 0.5\nsamples = 37625\nvalue = [18672, 18953]\nclass = 好人", fillcolor="#399de504"] ;
2 -> 3 ;
....
64 [label="淘宝交易成功次数 <= 186.5\ngini = 0.4778\nsamples = 59460\nvalue = [23462, 35998]\nclass = 好人", fillcolor="#399de559"] ;
0 -> 64 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
65 [label="花呗额度 <= 1050.0\ngini = 0.4993\nsamples = 23644\nvalue = [11367, 12277]\nclass = 好人", fillcolor="#399de513"] ;
64 -> 65 ;
66 [label="总期数 <= 4.5\ngini = 0.4978\nsamples = 12640\nvalue = [6737, 5903]\nclass = 坏人", fillcolor="#e5813920"] ;
65 -> 66 ;
67 [label="余额宝收益 <= 2.605\ngini = 0.4982\nsamples = 5410\nvalue = [2541, 2869]\nclass = 好人", fillcolor="#399de51d"] ;
66 -> 67 ;
...
{rank=same ; 0} ;
{rank=same ; 1; 64} ;
{rank=same ; 2; 33; 65; 96} ;
}
对应的树图:
可以看出,图跟文件是自上而下,先左后右发展。
- 定义树节点类
/// <summary>
/// A single node of a binary decision tree parsed from a DOT export.
/// Each node keeps links to its parent and to its two children so that the
/// chain of split conditions leading to a node can be rebuilt by walking upwards.
/// </summary>
public class DecisionTreeModel
{
    // Splits a raw condition such as "feature <= 28.5" into feature / operator / threshold.
    // Built once and shared, because ToConditions recurses through every ancestor.
    private static readonly Regex ConditionPattern =
        new Regex(@"^(\w+)\s+([<=>]+)\s+([\w.\-]+)");

    /// <summary>Numeric node id as it appears in the DOT file.</summary>
    public int Id { get; set; }

    /// <summary>
    /// Which branch of <see cref="Parent"/> leads to this node:
    /// <c>true</c> when the parent's condition holds, <c>false</c> when it is negated.
    /// </summary>
    public bool Direction { get; set; }

    /// <summary>Raw split condition of this node (for example "feature &lt;= 28.5").</summary>
    public string Condition { get; set; }

    /// <summary>Gini value read from the node label.</summary>
    public decimal Gini { get; set; }

    /// <summary>Sample count read from the node label.</summary>
    public int Samples { get; set; }

    /// <summary>Class name read from the node label.</summary>
    public string Classification { get; set; }

    /// <summary>Parent node, or <c>null</c> when this node has no parent.</summary>
    public DecisionTreeModel Parent { get; set; }

    /// <summary>
    /// When <c>true</c>, this node contributes no condition of its own to
    /// <see cref="ToConditions"/>; only its ancestors' conditions are emitted.
    /// </summary>
    public bool IsLeave { get; set; }

    /// <summary>First child attached to this node.</summary>
    public DecisionTreeModel LeftLeave { get; set; }

    /// <summary>Second child attached to this node.</summary>
    public DecisionTreeModel RightLeave { get; set; }

    /// <summary>
    /// Builds the conjunction of conditions on the path from the top-most ancestor down to this node.
    /// </summary>
    /// <param name="direction">
    /// Whether this node's own condition is taken as-is (<c>true</c>) or prefixed
    /// with "not " (<c>false</c>). Ignored when <see cref="IsLeave"/> is set.
    /// </param>
    /// <returns>
    /// The conditions joined with " and ", or an empty string when there is nothing to emit.
    /// </returns>
    public string ToConditions(bool direction = true)
    {
        // Ancestors first, so the resulting expression reads top-down.
        string conditions = this.Parent != null
            ? this.Parent.ToConditions(this.Direction)
            : string.Empty;

        if (this.IsLeave)
        {
            return conditions;
        }

        string own = (direction ? string.Empty : "not ") + FormatCondition(this.Condition);

        return string.IsNullOrEmpty(conditions)
            ? own
            : conditions + " and " + own;
    }

    /// <summary>
    /// Rewrites "feature op value" as "data['feature'] op value" without touching the
    /// stored <see cref="Condition"/>. Text that does not match the expected shape is
    /// returned unchanged, and <c>null</c> is treated as an empty condition.
    /// </summary>
    private static string FormatCondition(string condition)
    {
        if (string.IsNullOrEmpty(condition))
        {
            return string.Empty;
        }

        var match = ConditionPattern.Match(condition);
        if (!match.Success)
        {
            return condition;
        }

        return "data['" + match.Groups[1].Value + "'] " + match.Groups[2].Value + " " + match.Groups[3].Value;
    }
}
- 遍历文件的每一行,处理节点或者节点之间的关系
/// <summary>
/// Reads a decision-tree DOT file line by line, builds the node graph and
/// prints one rule (path conditions and class) for every leaf.
/// </summary>
class DecisionTreeBuilder
{
    // Node line: id, optional split condition, gini, samples, value pair, class, fill colour.
    private static readonly Regex NodePattern = new Regex(
        @"^(\d+)\s\[label=""(\w+\s[<=>]+\s[\w.\-]+\\n)?gini\s=\s([\w.\-]+)\\nsamples\s=\s(\d+)\\nvalue\s=\s\[\d+,\s\d+]\\nclass\s=\s([\w]+)"", fillcolor=""#[\w]+""\]\s;$");

    // Edge line: "parent -> child", optionally followed by attributes.
    private static readonly Regex EdgePattern = new Regex(@"^(\d+) -> (\d+)");

    // All nodes parsed so far, keyed by their DOT id.
    private readonly IDictionary<int, DecisionTreeModel> Branches = new Dictionary<int, DecisionTreeModel>();

    /// <summary>
    /// Parses the DOT file and fills the internal node table.
    /// Lines that are neither a recognised node nor an edge are ignored.
    /// </summary>
    /// <param name="path">Path of the DOT file to read (UTF-8). Defaults to "dotpath".</param>
    /// <exception cref="KeyNotFoundException">
    /// An edge refers to a node id that has not been parsed yet.
    /// </exception>
    public void Do(string path = "dotpath")
    {
        // Numbers in the DOT file use '.' as decimal separator regardless of the machine's locale.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        using (var reader = new StreamReader(path, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                var nodeMatch = NodePattern.Match(line);
                if (nodeMatch.Success)
                {
                    var node = new DecisionTreeModel()
                    {
                        Id = int.Parse(nodeMatch.Groups[1].Value, invariant),
                        // The optional condition group still carries the literal "\n" separator of the label.
                        Condition = nodeMatch.Groups[2].Value.Replace("\\n", ""),
                        Gini = decimal.Parse(nodeMatch.Groups[3].Value, System.Globalization.NumberStyles.Float, invariant),
                        Samples = int.Parse(nodeMatch.Groups[4].Value, invariant),
                        Classification = nodeMatch.Groups[5].Value
                    };
                    Branches.Add(node.Id, node);
                    continue;
                }

                var edgeMatch = EdgePattern.Match(line);
                if (!edgeMatch.Success)
                {
                    continue;
                }

                var parent = Branches[int.Parse(edgeMatch.Groups[1].Value, invariant)];
                var child = Branches[int.Parse(edgeMatch.Groups[2].Value, invariant)];

                // The first edge seen for a parent fills the left slot (condition holds),
                // any later edge goes to the right slot (condition negated).
                bool isLeft = parent.LeftLeave == null;
                if (isLeft)
                {
                    parent.LeftLeave = child;
                }
                else
                {
                    parent.RightLeave = child;
                }

                // A node counts as a leaf until an edge shows it has children of its own.
                parent.IsLeave = false;
                child.IsLeave = true;
                child.Direction = isLeft;
                child.Parent = parent;
            }
        }
    }

    /// <summary>
    /// Writes one line per leaf to the console: the path conditions, a tab-separated
    /// arrow, and the leaf's class.
    /// </summary>
    public void Print()
    {
        foreach (var item in Branches)
        {
            if (item.Value.IsLeave)
            {
                Console.WriteLine(item.Value.ToConditions() + "\t =>\t" + item.Value.Classification);
            }
        }
    }

    /// <summary>
    /// Original (misspelled) name of <see cref="Print"/>, kept so existing callers keep working.
    /// </summary>
    public void Prind()
    {
        Print();
    }
}
以上主要利用了二叉树的特点,对于应用到不同语言的语法就可以自行格式化,本文主要讲解思路。
最后来一张样图: