Machine Learning: building a decision tree model, analyzing with the gini and entropy criteria, visualizing the data, and evaluating the model by computing the false positive rate, accuracy, and recall (Algerian forest fires)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
data = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv')
data
|     | day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes |
|-----|-----|-------|------|-------------|----|----|------|------|-----|-----|-----|-----|-----|---------|
| 0   | 1   | 6 | 2012 | 29 | 57 | 18 | 0    | 65.7 | 3.4 | 7.6  | 1.3 | 3.4  | 0.5 | not fire |
| 1   | 2   | 6 | 2012 | 29 | 61 | 13 | 1.3  | 64.4 | 4.1 | 7.6  | 1   | 3.9  | 0.4 | not fire |
| 2   | 3   | 6 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1  | 0.3 | 2.7  | 0.1 | not fire |
| 3   | 4   | 6 | 2012 | 25 | 89 | 13 | 2.5  | 28.6 | 1.3 | 6.9  | 0   | 1.7  | 0   | not fire |
| 4   | 5   | 6 | 2012 | 27 | 77 | 16 | 0    | 64.8 | 3   | 14.2 | 1.2 | 3.9  | 0.5 | not fire |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 242 | 26  | 9 | 2012 | 30 | 65 | 14 | 0    | 85.4 | 16  | 44.5 | 4.5 | 16.9 | 6.5 | fire |
| 243 | 27  | 9 | 2012 | 28 | 87 | 15 | 4.4  | 41.1 | 6.5 | 8    | 0.1 | 6.2  | 0   | not fire |
| 244 | 28  | 9 | 2012 | 27 | 87 | 29 | 0.5  | 45.9 | 3.5 | 7.9  | 0.4 | 3.4  | 0.2 | not fire |
| 245 | 29  | 9 | 2012 | 24 | 54 | 18 | 0.1  | 79.7 | 4.3 | 15.2 | 1.7 | 5.1  | 0.7 | not fire |
| 246 | 30  | 9 | 2012 | 24 | 64 | 15 | 0.2  | 67.3 | 3.8 | 16.5 | 1.2 | 4.8  | 0.5 | not fire |

247 rows × 14 columns
2. Data preprocessing
2.1 Cleaning up the dataset
# The dataset covers two regions; we merge them into a single set for the analysis,
# dropping the second region's header rows and the blank row between the two blocks.
data1 = data.iloc[0:122, :]
data2 = data.iloc[125:247, :]
data = pd.concat([data1, data2])   # data is now the cleaned dataset

feature_names = data.columns.values   # extract the column names
feature_names
array(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ',
'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes '],
dtype=object)
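Note the stray whitespace in several column names (' RH', ' Ws', 'Rain ', 'Classes '). A small optional cleanup, not part of the original notebook, strips it so that columns can be referenced reliably by name:

# Editor's sketch: strip stray whitespace from the column names
# so that e.g. data['Classes'] works instead of data['Classes ']
data.columns = data.columns.str.strip()
feature_names = data.columns.values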
2.2 Separating the feature columns as X and the target column as Y
X = data.iloc[:, 0:13]
Y = data.iloc[:, -1]

# Clean up the target column: remove the stray spaces inside the labels
s = Y.values
for i in range(len(s)):
    s[i] = s[i].replace(' ', '')
print(s)
['notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'notfire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire']
# Fit a LabelEncoder and encode the labels: 'fire' -> 0, 'notfire' -> 1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(s)
y
array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 1])
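As a quick sanity check on the encoding (an editor's addition, not in the original notebook), the fitted encoder's classes_ attribute shows the label order:

# classes_ is sorted alphabetically: index 0 -> 'fire', index 1 -> 'notfire'
print(le.classes_)                                        # ['fire' 'notfire']
print(dict(zip(le.classes_, le.transform(le.classes_))))  # {'fire': 0, 'notfire': 1}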
y = pd.DataFrame(y)
y
|     | 0   |
|-----|-----|
| 0   | 1   |
| 1   | 1   |
| 2   | 1   |
| 3   | 1   |
| 4   | 1   |
| ... | ... |
| 239 | 0   |
| 240 | 1   |
| 241 | 1   |
| 242 | 1   |
| 243 | 1   |

244 rows × 1 columns
# Use train_test_split to split the data randomly into training and test sets (70% / 30%)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Check the sizes of the training and test sets
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((170, 13), (74, 13), (170, 1), (74, 1))
# Standardize the features so they are on comparable scales, which helps the model fit
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_train
|     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|-----|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 0   | 1.693550  | -0.377923 | 0.0 | -0.338241 | 1.164128  | -0.203369 | -0.414083 | 0.508256  | 1.099453  | 1.786169  | -0.022055 | 1.423974  | 0.490624  |
| 1   | -0.549241 | 1.406713  | 0.0 | -0.622900 | 1.027785  | 2.027127  | 0.621124  | -1.406602 | -1.043440 | -0.895966 | -0.921917 | -1.024626 | -0.941239 |
| 2   | 1.221384  | -1.270241 | 0.0 | 1.085051  | 0.005213  | 0.168381  | -0.414083 | 0.679098  | 0.132734  | -0.347347 | 0.552856  | -0.045186 | 0.325409  |
| 3   | -0.667282 | 1.406713  | 0.0 | -0.907558 | 0.823271  | -0.203369 | 0.218544  | -1.335418 | -0.817872 | -0.900170 | -0.996906 | -0.870714 | -0.941239 |
| 4   | 1.457467  | 0.514395  | 0.0 | 0.515734  | 0.141556  | 0.168381  | -0.414083 | 0.792993  | 2.847602  | 3.350048  | 0.627845  | 3.207954  | 1.757272  |
| ... | ...       | ...       | ... | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       |
| 165 | 0.040967  | -1.270241 | 0.0 | -0.907558 | 1.709500  | -0.203369 | -0.184037 | -2.196748 | -0.858152 | -0.904374 | -1.146882 | -0.905694 | -0.968775 |
| 166 | -0.431199 | 0.514395  | 0.0 | 0.800393  | -0.744673 | -0.946867 | -0.241548 | 0.216400  | 0.060230  | 0.506060  | -0.571971 | 0.255642  | -0.404291 |
| 167 | 0.395092  | -1.270241 | 0.0 | -0.053583 | 0.346070  | -0.575118 | 2.173935  | -0.972378 | -0.842040 | -0.900170 | -0.946913 | -0.898698 | -0.927471 |
| 168 | 0.749217  | 1.406713  | 0.0 | 0.231076  | 0.141556  | -0.946867 | -0.414083 | 0.757401  | 0.906109  | 1.161879  | 0.577852  | 1.074174  | 0.903661  |
| 169 | -0.903366 | -0.377923 | 0.0 | 0.231076  | 0.414242  | 1.283628  | -0.414083 | 0.522493  | -0.189505 | -0.025743 | 0.302895  | -0.115146 | 0.118890  |

170 rows × 13 columns
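One detail worth flagging: x_test is never transformed with the fitted scaler, so the models below are scored on unscaled test features, which likely explains the logistic regression score of 0.5 in the next cell. A minimal sketch of the usual fix, assuming the scaler fitted above (note the scores recorded below were produced without this step):

# Editor's sketch: apply the scaler fitted on x_train to x_test as well;
# fitting only on the training data avoids leaking test-set statistics
x_test = pd.DataFrame(scaler.transform(x_test))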
2.3 Building a logistic regression model with LogisticRegression
from sklearn.linear_model import LogisticRegression

model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train)
print(model_logic.score(x_test, y_test))
0.5
D:\Anoconda\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
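The DataConversionWarning is raised because y_train is a single-column DataFrame rather than a 1-D array. A hedged sketch of the usual remedy:

# Pass a flattened 1-D label array to silence the DataConversionWarning
model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train.values.ravel())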
3. Building the decision tree model
3.1 Defining a decision tree classifier and predicting
dtc = DecisionTreeClassifier()
# Train the classifier on the training set and its labels
dtc.fit(x_train, y_train)
# Predict on the test set
y_pre = dtc.predict(x_test)
# score() returns the accuracy of the classifier on the test set and its labels
score = dtc.score(x_test, y_test)

print("Predicted labels:", y_pre)
print("True labels:", y_test)
print("Accuracy:", score)
Predicted labels: [0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1
1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1]
True labels:      0
67 0
243 1
206 0
122 1
89 0
.. ..
158 0
99 1
173 0
176 1
95 1
[74 rows x 1 columns]
Accuracy: 0.9864864864864865
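Beyond a single accuracy number, a per-class breakdown can be useful; this optional sketch uses sklearn's classification_report with the label names from the encoding above:

from sklearn.metrics import classification_report

# Precision, recall and F1 per class for the default decision tree
print(classification_report(y_test, y_pre, target_names=['fire', 'notfire']))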
3.2 Building the gini and entropy decision tree models
# Build two depth-5 decision trees, one split on the 'gini' impurity index
# and one on 'entropy' (information gain)
dt_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)
dt_gini = dt_gini.fit(x_train, y_train)      # train on the training set

dt_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dt_entropy = dt_entropy.fit(x_train, y_train)

# Inspect both models' parameters
dt_gini, dt_entropy
(DecisionTreeClassifier(max_depth=5, random_state=0),
DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0))
3.3 Visualizing the trees
# plot_tree draws into the current figure, so each tree needs its own
# figure; otherwise the second call is drawn on top of the first.
plt.figure(figsize=(25, 30))                                      # 25 x 30 inch figure
plot_tree(dt_gini, filled=True, feature_names=feature_names)      # gini tree, nodes filled by class
plt.figure(figsize=(25, 30))
plot_tree(dt_entropy, filled=True, feature_names=feature_names)   # entropy tree
[Text(558.0, 1467.72, 'FFMC <= 80.1\nentropy = 0.974\nsamples = 170\nvalue = [101, 69]'),
Text(279.0, 1141.56, 'entropy = 0.0\nsamples = 67\nvalue = [0, 67]'),
Text(837.0, 1141.56, 'ISI <= 3.05\nentropy = 0.138\nsamples = 103\nvalue = [101, 2]'),
Text(558.0, 815.4000000000001, 'Temperature <= 33.5\nentropy = 0.764\nsamples = 9\nvalue = [7, 2]'),
Text(279.0, 489.24, 'entropy = 0.0\nsamples = 5\nvalue = [5, 0]'),
Text(837.0, 489.24, 'DC <= 82.75\nentropy = 1.0\nsamples = 4\nvalue = [2, 2]'),
Text(558.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2]'),
Text(1116.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [2, 0]'),
Text(1116.0, 815.4000000000001, 'entropy = 0.0\nsamples = 94\nvalue = [94, 0]')]
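For a compact, text-only view of the same splits (an optional addition, not in the original), sklearn.tree also offers export_text:

from sklearn.tree import export_text

# Print the gini tree's decision rules as indented text;
# only the first 13 names are used (the 14th column name is the target)
print(export_text(dt_gini, feature_names=list(feature_names[:13])))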
3.4 Computing training accuracy
# score() on the training set returns training accuracy (the training error is 1 - accuracy)
gini_train_score = dt_gini.score(x_train, y_train)
print("Training accuracy (gini):", gini_train_score)

entropy_train_score = dt_entropy.score(x_train, y_train)
print("Training accuracy (entropy):", entropy_train_score)
Training accuracy (gini): 1.0
Training accuracy (entropy): 1.0
3.5 Computing test accuracy
# score() on the test set returns test accuracy
gini_test_score = dt_gini.score(x_test, y_test)
print("Test accuracy (gini):", gini_test_score)

entropy_test_score = dt_entropy.score(x_test, y_test)
print("Test accuracy (entropy):", entropy_test_score)
Test accuracy (gini): 0.972972972972973
Test accuracy (entropy): 0.972972972972973
3.6 Plotting learning curves
# Plot learning curves for the gini and entropy models, with tree depth from 1 to 30
test1 = []   # training accuracy of the gini model at each depth
test2 = []   # training accuracy of the entropy model at each depth
for i in range(30):
    clf_gini = DecisionTreeClassifier(max_depth=i+1, criterion='gini', random_state=30, splitter='random')
    clf_entropy = DecisionTreeClassifier(max_depth=i+1, criterion='entropy', random_state=30, splitter='random')
    clf_gini = clf_gini.fit(x_train, y_train)        # train the models
    clf_entropy = clf_entropy.fit(x_train, y_train)
    score1 = clf_gini.score(x_train, y_train)        # training accuracy at this depth
    score2 = clf_entropy.score(x_train, y_train)
    test1.append(score1)
    test2.append(score2)

# Draw the two curves side by side
plt.subplot(1, 2, 1)
plt.plot(range(1, 31), test1, color='red', label='gini')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(range(1, 31), test2, color='blue', label='entropy')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.show()
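Because these curves score on the training set, they mainly show how quickly each tree memorizes the data. A variant that also tracks test accuracy (an editor's sketch, not part of the original) makes the overfitting gap visible:

# Track training and test accuracy together for the gini criterion
train_acc, test_acc = [], []
for depth in range(1, 31):
    clf = DecisionTreeClassifier(max_depth=depth, criterion='gini', random_state=30)
    clf.fit(x_train, y_train)
    train_acc.append(clf.score(x_train, y_train))
    test_acc.append(clf.score(x_test, y_test))

plt.plot(range(1, 31), train_acc, color='red', label='train')
plt.plot(range(1, 31), test_acc, color='blue', label='test')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()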
3.7 Model evaluation and optimization
(1) Evaluating the gini decision tree model
# Predict on the test set with the gini model
gini_pre = dt_gini.predict(x_test)

# Confusion matrix; with labels=[0, 1], sklearn's ravel() yields tn, fp, fn, tp,
# treating class 1 ('notfire') as the positive class
cm = confusion_matrix(y_test, gini_pre, labels=[0, 1])
print(cm)
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)   # accuracy
recall = tp / (tp + fn)                      # recall (true positive rate)
fpr = fp / (fp + tn)                         # false positive rate
precision = tp / (tp + fp)                   # precision
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
[[36 1]
[ 1 36]]
36 1 1 36
Accuracy: 97.30%
Recall: 97.30%
False positive rate: 2.70%
Precision: 97.30%
(2) Evaluating the entropy decision tree model
# Predict on the test set with the entropy model
entropy_pre = dt_entropy.predict(x_test)

# Confusion matrix, unpacked in sklearn's tn, fp, fn, tp order as above
cm = confusion_matrix(y_test, entropy_pre, labels=[0, 1])
print(cm)
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)   # accuracy
recall = tp / (tp + fn)                      # recall (true positive rate)
fpr = fp / (fp + tn)                         # false positive rate
precision = tp / (tp + fp)                   # precision
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
[[37 0]
[ 2 35]]
37 0 2 35
Accuracy: 97.30%
Recall: 94.59%
False positive rate: 0.00%
Precision: 100.00%
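The hand-computed values can be cross-checked against sklearn's built-in metric functions (an optional sketch; pos_label=1 matches the 'notfire' class treated as positive above):

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Cross-check the metrics for the entropy model
print(accuracy_score(y_test, entropy_pre))
print(recall_score(y_test, entropy_pre, pos_label=1))
print(precision_score(y_test, entropy_pre, pos_label=1))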
(3) Evaluating a classifier with cross-validation
# Import the cross-validation helper
from sklearn.model_selection import cross_val_score
# Import the support vector machine classifier
from sklearn.svm import SVC

# SVC with a linear kernel
svc = SVC(kernel='linear')
# Score the SVC with 10-fold cross-validation
scores = cross_val_score(svc, X, y, cv=10)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [0.96 1. 0.96 1. 1. 0.875
1. 1. 0.91666667 0.95833333]
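The per-fold scores are usually summarized by their mean and spread (editor's addition):

# Mean and standard deviation over the 10 folds
print("Mean CV score: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std()))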
plt.figure()
plt.title('k-fold cross-validation scores of the linear SVC')   # the scores are for the SVC, not the decision tree
plt.plot(range(10), scores, 'bs-')