决策树转为hivesql语句部署
首先生成一个决策树,然后画出决策树,再将决策树条件转化成hiveSQL语句进行部署,并非所有公司有部署决策树的环境,这时候可以使用SQL去部署
首先创建一个dt_main.py,主要是决策树方面的,代码如下:

#!/usr/bin/env python # coding: utf-8 # In[ ]: #导入模块 from sklearn import tree import pandas as pd import numpy as np import os import toad from pydotplus import graph_from_dot_data from sklearn.tree import export_graphviz from six import StringIO #卡方分箱决策树 class auto_DecisionTreeRegressor(object): def __init__(self,datasets,ex_lis,dep='bad_ind',min_samples=0.05,min_samples_leaf=200,min_samples_split=20,max_depth=4,is_bin=True): ''' datasets:数据集 dataframe格式 ex_lis:不参与建模的特征,如id,时间切片等。 list格式 min_samples:分箱时最小箱的样本占总比 numeric格式 max_depth:决策树最大深度 numeric格式 min_samples_leaf:决策树子节点最小样本个数 numeric格式 min_samples_split:决策树划分前,父节点最小样本个数 numeric格式 is_bin:是否进行卡方分箱 bool格式(True/False) ''' self.datasets = datasets self.ex_lis = ex_lis self.dep = dep self.max_depth = max_depth self.min_samples = min_samples self.min_samples_leaf = min_samples_leaf self.min_samples_split = min_samples_split self.is_bin = is_bin self.bins = 0 def fit_plot(self): os.environ["PATH"] += os.pathsep + 'D:/Program Files' dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, min_samples_split=self.min_samples_split) x = self.datasets.drop(self.ex_lis,axis=1) y = self.datasets[self.dep] combiner = 0 if self.is_bin: #分箱 combiner = toad.transform.Combiner() combiner.fit(x,y,method='chi',min_samples = self.min_samples) x_bin= combiner.transform(x) self.bins = combiner.export() else: x_bin = x.copy() dtree = dtree.fit(x_bin,y) df_bin = x_bin.copy() df_bin[self.dep] = y dot_data = StringIO() tree.export_graphviz(dtree, out_file=dot_data, feature_names=x_bin.columns, class_names=[self.dep], filled=True, rounded=True, special_characters=True) graph = graph_from_dot_data(dot_data.getvalue()) png = graph.create_png() #返回字典 dct = { 'df_bin': df_bin, 'bins':self.bins, 'combiner':combiner, 'graph':png, 'dtree':dtree, 'feature_names':x_bin.columns } return dct
再创建一个Decision_utils.py,找到决策树的叶子节点并抽取决策路径到SQL中。因为是一个重新写的demo,如果公司实际生产环境,还需要在脚本中使用cast(features as flaot) 之类的函数,就自己添加
代码如下:

#!/usr/bin/env python # coding: utf-8 import toad import pandas as pd import numpy as np import pydotplus from IPython.display import Image from six import StringIO import os from sklearn import tree def DTR_TO_SQL(tree, feature_names, when_else=-1): import numpy as np left = tree.tree_.children_left right = tree.tree_.children_right threshold = tree.tree_.threshold features = [feature_names[i] for i in tree.tree_.feature] le='<=' g ='>' idx = np.argwhere(left == -1)[:,0] def find_value(tree): value_lis=[] tree_ = tree.tree_ def recurse(node, depth): from sklearn.tree import _tree if tree_.feature[node] != _tree.TREE_UNDEFINED: recurse(tree_.children_left[node], depth + 1) recurse(tree_.children_right[node], depth + 1) else: value_lis.append(tree_.value[node][0][0]) recurse(0, 1) return value_lis value_lis = find_value(tree) def recurse(left, right, child, lineage=None): if lineage is None: lineage = [child] if child in left: parent = np.where(left == child)[0].item() split = 'l' else: parent = np.where(right == child)[0].item() split = 'r' lineage.append((parent, split, threshold[parent], features[parent])) if parent == 0: lineage.reverse() return lineage else: return recurse(left, right, parent, lineage) print('case ') for j,child in enumerate(idx): clause=' when ' for node in recurse(left, right, child): if len(str(node))<3: continue i=node if i[1]=='l': sign=le else: sign=g clause=clause+i[3]+sign+str(i[2])+' and ' clause=clause[:-4]+' then '+ str(value_lis[j]) print(clause) print('else %s end as clusters'%(when_else))
将这二者放在同一文件夹里面,然后再在此文件夹打开一个notebook,运行下面代码
import pandas as pd import sys sys.path.append('/函数/automl/') from sklearn.datasets import make_classification from dt_main import auto_DecisionTreeRegressor from Decision_utils import DTR_TO_SQL from IPython.display import Image X, y = make_classification(n_samples=1000,n_features=30,n_classes=2,random_state=328) data = pd.DataFrame(X) data['bad_ind'] = y data['imei'] = [i for i in range(len(data))] data.columns = ['f0_radius','f0_texture','f0_perimeter','f0_area','f0_smoothness', 'f0_compactness','f0_concavity','f0_concave_points','f0_symmetry', 'f0_fractal_dimension','f1_radius_error','f1_texture_error','f1_perimeter_error', 'f2_area_error','f2_smoothness_error','f2_compactness_error','f2_concavity_error', 'f2_concave_points_error','f2_symmetry_error','f2_fractal_dimension_error', 'f3_radius','f3_texture','f3_perimeter','f3_area','f3_smoothness', 'f3_compactness','f3_concavity','f3_concave_points','f3_symmetry', 'f3_fractal_dimension','bad_ind','imei'] uid, dep = "imei", "bad_ind" var_names = list(data.columns) var_names.remove(dep) var_names.remove(uid) #指定不参与建模的变量,包含标签bad_ind。 ex_lis = [uid, dep] #调用决策树函数 dct = auto_DecisionTreeRegressor(datasets = data,ex_lis = ex_lis,is_bin=False, dep = dep,min_samples=0.01,max_depth=4, min_samples_leaf=50,min_samples_split=50).fit_plot() #展示图像 Image(dct['graph'])
如果需要查看SQL:
#查看SQL代码 DTR_TO_SQL(dct['dtree'],dct['feature_names'],when_else=0)
case when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error<=0.06367828324437141 then 0.08433734939759036 when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error>0.06367828324437141 and f2_concave_points_error<=0.9470688104629517 then 0.0 when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error>0.06367828324437141 and f2_concave_points_error>0.9470688104629517 then 0.03773584905660377 when f0_symmetry<=-0.028780914843082428 and f0_symmetry>-0.6725731492042542 then 0.2857142857142857 when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error<=-1.2111823558807373 then 0.96 when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error>-1.2111823558807373 and f2_smoothness_error<=0.37725594639778137 then 1.0 when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error>-1.2111823558807373 and f2_smoothness_error>0.37725594639778137 then 0.9770114942528736 when f0_symmetry>-0.028780914843082428 and f3_symmetry>0.34790952503681183 then 0.26 else 0 end as clusters
文章参考:https://zhuanlan.zhihu.com/p/188759754
修改了一些错误,使得整个程序得以顺利运行
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人