Machine Learning: Building Decision Tree Models with the gini and entropy Criteria, Visualizing the Results, and Evaluating the Models via False Positive Rate, Accuracy, and Recall (Algerian Forest Fires)

 

The Algerian forest fires data set

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier,plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

1. Data loading and analysis

data = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv')
data

 

 

     day  month  year  Temperature  RH  Ws  Rain  FFMC   DMC    DC  ISI   BUI  FWI   Classes
0      1      6  2012           29  57  18     0  65.7   3.4   7.6  1.3   3.4  0.5  not fire
1      2      6  2012           29  61  13   1.3  64.4   4.1   7.6    1   3.9  0.4  not fire
2      3      6  2012           26  82  22  13.1  47.1   2.5   7.1  0.3   2.7  0.1  not fire
3      4      6  2012           25  89  13   2.5  28.6   1.3   6.9    0   1.7    0  not fire
4      5      6  2012           27  77  16     0  64.8     3  14.2  1.2   3.9  0.5  not fire
..   ...    ...   ...          ...  ..  ..   ...   ...   ...   ...  ...   ...  ...       ...
242   26      9  2012           30  65  14     0  85.4    16  44.5  4.5  16.9  6.5      fire
243   27      9  2012           28  87  15   4.4  41.1   6.5     8  0.1   6.2    0  not fire
244   28      9  2012           27  87  29   0.5  45.9   3.5   7.9  0.4   3.4  0.2  not fire
245   29      9  2012           24  54  18   0.1  79.7   4.3  15.2  1.7   5.1  0.7  not fire
246   30      9  2012           24  64  15   0.2  67.3   3.8  16.5  1.2   4.8  0.5  not fire

247 rows × 14 columns


 

2. Data preprocessing

2.1 Tidying the data set

# The data set contains records from two regions; we merge them and analyze them as one.
# That means dropping the two region-header rows and the blank row that separate them.
data1 = data.iloc[0:122,:]    # first region's rows
data2 = data.iloc[125:247,:]  # second region's rows (rows 122-124 are the separator rows)
data = pd.concat([data1,data2])   # data is now the tidied data set
feature_names = data.columns.values   # extract the column names
feature_names

 

array(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ',
      'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes '],
    dtype=object)
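
Note that several of these names carry stray whitespace (' RH', ' Ws', 'Rain ', 'Classes '). A small optional cleanup, sketched here under the assumption that the later cells are re-run after applying it:

# Strip stray whitespace from the column names so they can be referenced reliably
data.columns = data.columns.str.strip()
feature_names = data.columns.values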

 

2.2 Split the data into the feature columns X and the target column Y

X = data.iloc[:,0:13]   # the 13 feature columns
Y = data.iloc[:,-1]     # the Classes target column
# Tidy the target column: the labels carry stray spaces, so remove them
s = Y.values
for i in range(len(s)):
    s[i] = s[i].replace(' ','')   # e.g. 'not fire   ' -> 'notfire'
print(s)

 

['notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'notfire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire']
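
As an aside, the element-wise loop above can be written as a single vectorized call; a minimal sketch, assuming Y still holds the raw label strings:

# Vectorized equivalent of the loop: remove all spaces inside each label
s = Y.str.replace(' ', '', regex=False).values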

 

# Encode the labels with a LabelEncoder: 'notfire' -> 1, 'fire' -> 0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(s)
y

 

 

array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
      0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
      1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
      0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
      0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
      0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
      0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
      1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
      0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
      1, 1])
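
To confirm the mapping rather than trusting the comment, le.classes_ lists the class names in encoded order (index = integer code). LabelEncoder sorts the classes alphabetically, so 'fire' comes first:

print(le.classes_)   # ['fire' 'notfire']  ->  fire=0, notfire=1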

 

y= pd.DataFrame(y)
y

 

 

 

       0
0      1
1      1
2      1
3      1
4      1
..   ...
239    0
240    1
241    1
242    1
243    1

244 rows × 1 columns


 

# Randomly split the data into training and test sets with train_test_split (70% / 30%)
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)
# Check the sizes of the training and test sets
x_train.shape, x_test.shape, y_train.shape, y_test.shape

 

 

((170, 13), (74, 13), (170, 1), (74, 1))
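
Because the two classes are somewhat imbalanced, it may be worth preserving the class ratio in both splits. A hedged variant of the call above using the stratify parameter:

# stratify=y keeps the fire/notfire ratio roughly the same in the train and test sets
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)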

 

# Standardize the features: fit the scaler on the training set only, then apply it to both sets
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))   # the test set must be transformed with the same scaler
x_train

 

 

 

           0         1    2         3         4         5         6         7         8         9        10        11        12
0 1.693550 -0.377923 0.0 -0.338241 1.164128 -0.203369 -0.414083 0.508256 1.099453 1.786169 -0.022055 1.423974 0.490624
1 -0.549241 1.406713 0.0 -0.622900 1.027785 2.027127 0.621124 -1.406602 -1.043440 -0.895966 -0.921917 -1.024626 -0.941239
2 1.221384 -1.270241 0.0 1.085051 0.005213 0.168381 -0.414083 0.679098 0.132734 -0.347347 0.552856 -0.045186 0.325409
3 -0.667282 1.406713 0.0 -0.907558 0.823271 -0.203369 0.218544 -1.335418 -0.817872 -0.900170 -0.996906 -0.870714 -0.941239
4 1.457467 0.514395 0.0 0.515734 0.141556 0.168381 -0.414083 0.792993 2.847602 3.350048 0.627845 3.207954 1.757272
... ... ... ... ... ... ... ... ... ... ... ... ... ...
165 0.040967 -1.270241 0.0 -0.907558 1.709500 -0.203369 -0.184037 -2.196748 -0.858152 -0.904374 -1.146882 -0.905694 -0.968775
166 -0.431199 0.514395 0.0 0.800393 -0.744673 -0.946867 -0.241548 0.216400 0.060230 0.506060 -0.571971 0.255642 -0.404291
167 0.395092 -1.270241 0.0 -0.053583 0.346070 -0.575118 2.173935 -0.972378 -0.842040 -0.900170 -0.946913 -0.898698 -0.927471
168 0.749217 1.406713 0.0 0.231076 0.141556 -0.946867 -0.414083 0.757401 0.906109 1.161879 0.577852 1.074174 0.903661
169 -0.903366 -0.377923 0.0 0.231076 0.414242 1.283628 -0.414083 0.522493 -0.189505 -0.025743 0.302895 -0.115146 0.118890

170 rows × 13 columns


 

2.3 Build a logistic regression baseline with LogisticRegression

from sklearn.linear_model import LogisticRegression
model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train.values.ravel())   # ravel() avoids the column-vector warning
print(model_logic.score(x_test,y_test))

 

0.5
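
An accuracy of 0.5 is a red flag: it is what happens when a model is trained on standardized features but scored on features in a different scale (hence the x_test transform added above). A Pipeline bundles scaling and classification into one estimator so this mismatch cannot occur; a minimal sketch, not part of the original notebook:

from sklearn.pipeline import make_pipeline

# The scaler inside the pipeline is fit on the training data only,
# and is applied automatically before every predict/score call
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))
pipe.fit(x_train, y_train.values.ravel())
print(pipe.score(x_test, y_test))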

3. Building the decision tree model

3.1 Define a decision tree classifier and make predictions

 

dtc = DecisionTreeClassifier()
# Train the classifier on the training data and its labels
dtc.fit(x_train,y_train)
# Predict labels for the test data
y_pre = dtc.predict(x_test)
# Compute the accuracy on the test data and its labels
score = dtc.score(x_test,y_test)
# Output
print("Predictions:",y_pre)
print("Ground truth:",y_test)
print("Accuracy:",score)

 

Predictions: [0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1
 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1]
Ground truth:     0
67   0
243 1
206 0
122 1
89   0
.. ..
158 0
99   1
173 0
176 1
95   1

[74 rows x 1 columns]
Accuracy: 0.9864864864864865

3.2 Build decision tree models with the gini and entropy criteria

# Build two decision tree models with a maximum depth of 5: one splitting on the
# 'gini' index and one on 'entropy' (information gain).
# Decision tree with the gini criterion
dt_gini = DecisionTreeClassifier(criterion='gini',max_depth=5,random_state=0)
dt_gini = dt_gini.fit(x_train,y_train)   # train on the training set
# Decision tree with the entropy criterion
dt_entropy = DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=0)
dt_entropy = dt_entropy.fit(x_train,y_train)
# Inspect both models' parameters
dt_gini,dt_entropy

 

 

(DecisionTreeClassifier(max_depth=5, random_state=0),
DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0))
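
To see which features each criterion actually splits on, the fitted trees expose feature_importances_. A quick sketch (only the first 13 entries of feature_names are feature columns; the last is the target):

# Print the five most important features of the gini tree, highest first
order = np.argsort(dt_gini.feature_importances_)[::-1]
for i in order[:5]:
    print(feature_names[i], dt_gini.feature_importances_[i])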

 

3.3 Visualizing the trees

# Plot each tree in its own figure (25 in wide by 30 in tall); filled=True colors nodes by class
plt.figure(figsize=(25,30))
plot_tree(dt_gini,filled = True,feature_names = feature_names)      # gini model
plt.figure(figsize=(25,30))
plot_tree(dt_entropy,filled = True,feature_names = feature_names)   # entropy model

 

 

[Text(558.0, 1467.72, 'FFMC <= 80.1\nentropy = 0.974\nsamples = 170\nvalue = [101, 69]'),
Text(279.0, 1141.56, 'entropy = 0.0\nsamples = 67\nvalue = [0, 67]'),
Text(837.0, 1141.56, 'ISI <= 3.05\nentropy = 0.138\nsamples = 103\nvalue = [101, 2]'),
Text(558.0, 815.4000000000001, 'Temperature <= 33.5\nentropy = 0.764\nsamples = 9\nvalue = [7, 2]'),
Text(279.0, 489.24, 'entropy = 0.0\nsamples = 5\nvalue = [5, 0]'),
Text(837.0, 489.24, 'DC <= 82.75\nentropy = 1.0\nsamples = 4\nvalue = [2, 2]'),
Text(558.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2]'),
Text(1116.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [2, 0]'),
Text(1116.0, 815.4000000000001, 'entropy = 0.0\nsamples = 94\nvalue = [94, 0]')]

 

(figure: decision tree visualization)
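
For a text view of the same trees that is easier to read inline than the plots, sklearn also provides export_text; a sketch:

from sklearn.tree import export_text

# Print the gini tree's split rules as indented text
print(export_text(dt_gini, feature_names=list(feature_names[:13])))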

3.4 Compute the training accuracy

# Note: score() returns accuracy, so these are training accuracies rather than errors
# Training accuracy of the gini model
gini_train_score = dt_gini.score(x_train,y_train)
print("Training accuracy (gini):",gini_train_score)
# Training accuracy of the entropy model
entropy_train_score = dt_entropy.score(x_train,y_train)
print("Training accuracy (entropy):",entropy_train_score)

 

Training accuracy (gini): 1.0
Training accuracy (entropy): 1.0

3.5 Compute the test accuracy

# Test accuracy of the gini model
gini_test_score = dt_gini.score(x_test,y_test)
print("Test accuracy (gini):",gini_test_score)
# Test accuracy of the entropy model
entropy_test_score = dt_entropy.score(x_test,y_test)
print("Test accuracy (entropy):",entropy_test_score)

 

Test accuracy (gini): 0.972972972972973
Test accuracy (entropy): 0.972972972972973

3.6 Plot learning curves

# Plot learning curves for the gini and entropy models with tree depth from 1 to 30
test1 = []  # training accuracy of the gini model at each depth
test2 = []  # training accuracy of the entropy model at each depth
for i in range(30):
    clf_gini = DecisionTreeClassifier(max_depth=i+1,criterion='gini',random_state=30,splitter='random')
    clf_entropy = DecisionTreeClassifier(max_depth=i+1,criterion='entropy',random_state=30,splitter='random')
    clf_gini = clf_gini.fit(x_train,y_train)        # train the models
    clf_entropy = clf_entropy.fit(x_train,y_train)
    score1 = clf_gini.score(x_train,y_train)        # note: scored on the training set
    score2 = clf_entropy.score(x_train,y_train)
    test1.append(score1)
    test2.append(score2)
# Plot: gini curve in red on the left, entropy curve in blue on the right
plt.subplot(1,2,1)
plt.plot(range(1,31),test1,color='red',label='gini')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()

plt.subplot(1,2,2)
plt.plot(range(1,31),test2,color='blue',label='entropy')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.show()

 

(figure: training-accuracy learning curves for the gini and entropy models)
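
Both curves above are computed on the training set, so they mostly show the trees memorizing the data as depth grows. A hedged extension that also tracks test accuracy makes any overfitting visible:

# Compare training and test accuracy of the gini model as depth grows
train_acc, test_acc = [], []
for depth in range(1,31):
    clf = DecisionTreeClassifier(max_depth=depth,criterion='gini',random_state=30)
    clf.fit(x_train,y_train)
    train_acc.append(clf.score(x_train,y_train))
    test_acc.append(clf.score(x_test,y_test))
plt.plot(range(1,31),train_acc,color='red',label='train')
plt.plot(range(1,31),test_acc,color='blue',label='test')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()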

3.7 Model evaluation and optimization

(1) Evaluate the gini decision tree model

# Compute predictions on the test set
gini_pre = dt_gini.predict(x_test)
gini_pre
# Output the confusion matrix; with labels=[0,1], class 1 ('notfire') is the positive class
cm = confusion_matrix(y_test,gini_pre,labels=[0,1])
print(cm)

# sklearn's confusion_matrix ravels in the order tn, fp, fn, tp (not tp, tn, fp, fn)
tn,fp,fn,tp = cm.ravel()
print(tn,fp,fn,tp)
# Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
# Recall (true positive rate)
recall = tp / (tp + fn)
# False positive rate
fpr = fp / (fp + tn)
# Precision
precision = tp / (tp + fp)
print("Accuracy: {:.2f}%".format(accuracy*100))
print("Recall: {:.2f}%".format(recall*100))
print("False positive rate: {:.2f}%".format(fpr*100))
print("Precision: {:.2f}%".format(precision*100))

 

[[36  1]
 [ 1 36]]
36 1 1 36
Accuracy: 97.30%
Recall: 97.30%
False positive rate: 2.70%
Precision: 97.30%
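
As a cross-check on the hand computation, sklearn's metric functions give the same numbers directly (pos_label=1 matches the 'notfire'-as-positive convention used above):

from sklearn.metrics import accuracy_score, recall_score, precision_score

y_true = y_test.values.ravel()
print(accuracy_score(y_true, gini_pre))
print(recall_score(y_true, gini_pre, pos_label=1))
print(precision_score(y_true, gini_pre, pos_label=1))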

(2) Evaluate the entropy decision tree model

# Compute predictions on the test set
entropy_pre = dt_entropy.predict(x_test)
entropy_pre
# Output the confusion matrix; class 1 ('notfire') is again the positive class
cm = confusion_matrix(y_test,entropy_pre,labels=[0,1])
print(cm)

tn,fp,fn,tp = cm.ravel()   # ravel order is tn, fp, fn, tp
print(tn,fp,fn,tp)
# Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
# Recall (true positive rate)
recall = tp / (tp + fn)
# False positive rate
fpr = fp / (fp + tn)
# Precision
precision = tp / (tp + fp)
print("Accuracy: {:.2f}%".format(accuracy*100))
print("Recall: {:.2f}%".format(recall*100))
print("False positive rate: {:.2f}%".format(fpr*100))
print("Precision: {:.2f}%".format(precision*100))

 

[[37  0]
 [ 2 35]]
37 0 2 35
Accuracy: 97.30%
Recall: 94.59%
False positive rate: 0.00%
Precision: 100.00%

(3) Evaluate with cross-validation

# Import the cross-validation helper
from sklearn.model_selection import cross_val_score
# Import the support vector classifier
from sklearn.svm import SVC
# Use a linear kernel for the SVC
svc = SVC(kernel='linear')
# Score the SVC with 10-fold cross-validation (note: this evaluates the SVC, not the decision tree)
scores = cross_val_score(svc,X,y.values.ravel(),cv=10)
# Print the results
print("Cross-validation scores: {}".format(scores))

 

Cross-validation scores: [0.96       1.         0.96       1.         1.         0.875
 1.         1.         0.91666667 0.95833333]

 

plt.figure()
plt.title('10-fold cross-validation scores (SVC, linear kernel)')   # the scores above come from the SVC, not the decision tree
plt.plot(range(1,11),scores,'bs-')   # fold numbers 1 through 10 on the x axis

 

 

(figure: k-fold cross-validation score curve)
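
Since the goal of this section is to evaluate the decision tree, the same cross-validation can be applied to it directly; a sketch, not part of the original run:

# 10-fold cross-validation of the depth-5 gini tree on the full data set
tree_scores = cross_val_score(
    DecisionTreeClassifier(criterion='gini',max_depth=5,random_state=0),
    X, y.values.ravel(), cv=10)
print("Decision tree CV scores: {}".format(tree_scores))
print("Mean: {:.3f}".format(tree_scores.mean()))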

 
