Machine Learning: building a decision tree model, analyzing with the gini and entropy criteria, visualizing the data, and evaluating the model by computing the false positive rate, accuracy, and recall (Algerian forest fires)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
data = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv')
data
|     | day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes |
|-----|-----|-------|------|-------------|----|----|------|------|-----|-----|-----|-----|-----|---------|
| 0   | 1   | 6 | 2012 | 29 | 57 | 18 | 0    | 65.7 | 3.4 | 7.6  | 1.3 | 3.4  | 0.5 | not fire |
| 1   | 2   | 6 | 2012 | 29 | 61 | 13 | 1.3  | 64.4 | 4.1 | 7.6  | 1   | 3.9  | 0.4 | not fire |
| 2   | 3   | 6 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1  | 0.3 | 2.7  | 0.1 | not fire |
| 3   | 4   | 6 | 2012 | 25 | 89 | 13 | 2.5  | 28.6 | 1.3 | 6.9  | 0   | 1.7  | 0   | not fire |
| 4   | 5   | 6 | 2012 | 27 | 77 | 16 | 0    | 64.8 | 3   | 14.2 | 1.2 | 3.9  | 0.5 | not fire |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 242 | 26  | 9 | 2012 | 30 | 65 | 14 | 0    | 85.4 | 16  | 44.5 | 4.5 | 16.9 | 6.5 | fire |
| 243 | 27  | 9 | 2012 | 28 | 87 | 15 | 4.4  | 41.1 | 6.5 | 8    | 0.1 | 6.2  | 0   | not fire |
| 244 | 28  | 9 | 2012 | 27 | 87 | 29 | 0.5  | 45.9 | 3.5 | 7.9  | 0.4 | 3.4  | 0.2 | not fire |
| 245 | 29  | 9 | 2012 | 24 | 54 | 18 | 0.1  | 79.7 | 4.3 | 15.2 | 1.7 | 5.1  | 0.7 | not fire |
| 246 | 30  | 9 | 2012 | 24 | 64 | 15 | 0.2  | 67.3 | 3.8 | 16.5 | 1.2 | 4.8  | 0.5 | not fire |

247 rows × 14 columns
2. Data preprocessing
2.1 Cleaning up the dataset
# The dataset covers two regions; we merge them into a single set for the analysis,
# dropping the second region's header rows and the blank row between the two blocks.
data1 = data.iloc[0:122, :]
data2 = data.iloc[125:247, :]
data = pd.concat([data1, data2])   # data is now the cleaned dataset

feature_names = data.columns.values   # extract the column names
feature_names
array(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ',
'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes '],
dtype=object)
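Note the stray whitespace in several column names (' RH', ' Ws', 'Rain ', 'Classes '). A small optional cleanup, not part of the original notebook, strips it so that columns can be referenced reliably by name:

# Editor's sketch: strip stray whitespace from the column names
# so that e.g. data['Classes'] works instead of data['Classes ']
data.columns = data.columns.str.strip()
feature_names = data.columns.values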
2.2 Separating the feature columns as X and the target column as Y
X = data.iloc[:, 0:13]
Y = data.iloc[:, -1]

# Clean up the target column: remove the stray spaces inside the labels
s = Y.values
for i in range(len(s)):
    s[i] = s[i].replace(' ', '')
print(s)
['notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'notfire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire']
# Fit a LabelEncoder and encode the labels: 'fire' -> 0, 'notfire' -> 1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(s)
y
array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 1])
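As a quick sanity check on the encoding (an editor's addition, not in the original notebook), the fitted encoder's classes_ attribute shows the label order:

# classes_ is sorted alphabetically: index 0 -> 'fire', index 1 -> 'notfire'
print(le.classes_)                                        # ['fire' 'notfire']
print(dict(zip(le.classes_, le.transform(le.classes_))))  # {'fire': 0, 'notfire': 1}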
y = pd.DataFrame(y)
y
|     | 0   |
|-----|-----|
| 0   | 1   |
| 1   | 1   |
| 2   | 1   |
| 3   | 1   |
| 4   | 1   |
| ... | ... |
| 239 | 0   |
| 240 | 1   |
| 241 | 1   |
| 242 | 1   |
| 243 | 1   |

244 rows × 1 columns
# Use train_test_split to split the data randomly into training and test sets (70% / 30%)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Check the sizes of the training and test sets
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((170, 13), (74, 13), (170, 1), (74, 1))
# Standardize the features so they are on comparable scales, which helps the model fit
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_train
|     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|-----|---|---|---|---|---|---|---|---|---|---|----|----|----|
| 0   | 1.693550  | -0.377923 | 0.0 | -0.338241 | 1.164128  | -0.203369 | -0.414083 | 0.508256  | 1.099453  | 1.786169  | -0.022055 | 1.423974  | 0.490624  |
| 1   | -0.549241 | 1.406713  | 0.0 | -0.622900 | 1.027785  | 2.027127  | 0.621124  | -1.406602 | -1.043440 | -0.895966 | -0.921917 | -1.024626 | -0.941239 |
| 2   | 1.221384  | -1.270241 | 0.0 | 1.085051  | 0.005213  | 0.168381  | -0.414083 | 0.679098  | 0.132734  | -0.347347 | 0.552856  | -0.045186 | 0.325409  |
| 3   | -0.667282 | 1.406713  | 0.0 | -0.907558 | 0.823271  | -0.203369 | 0.218544  | -1.335418 | -0.817872 | -0.900170 | -0.996906 | -0.870714 | -0.941239 |
| 4   | 1.457467  | 0.514395  | 0.0 | 0.515734  | 0.141556  | 0.168381  | -0.414083 | 0.792993  | 2.847602  | 3.350048  | 0.627845  | 3.207954  | 1.757272  |
| ... | ...       | ...       | ... | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       | ...       |
| 165 | 0.040967  | -1.270241 | 0.0 | -0.907558 | 1.709500  | -0.203369 | -0.184037 | -2.196748 | -0.858152 | -0.904374 | -1.146882 | -0.905694 | -0.968775 |
| 166 | -0.431199 | 0.514395  | 0.0 | 0.800393  | -0.744673 | -0.946867 | -0.241548 | 0.216400  | 0.060230  | 0.506060  | -0.571971 | 0.255642  | -0.404291 |
| 167 | 0.395092  | -1.270241 | 0.0 | -0.053583 | 0.346070  | -0.575118 | 2.173935  | -0.972378 | -0.842040 | -0.900170 | -0.946913 | -0.898698 | -0.927471 |
| 168 | 0.749217  | 1.406713  | 0.0 | 0.231076  | 0.141556  | -0.946867 | -0.414083 | 0.757401  | 0.906109  | 1.161879  | 0.577852  | 1.074174  | 0.903661  |
| 169 | -0.903366 | -0.377923 | 0.0 | 0.231076  | 0.414242  | 1.283628  | -0.414083 | 0.522493  | -0.189505 | -0.025743 | 0.302895  | -0.115146 | 0.118890  |

170 rows × 13 columns
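One detail worth flagging: x_test is never transformed with the fitted scaler, so the models below are scored on unscaled test features, which likely explains the logistic regression score of 0.5 in the next cell. A minimal sketch of the usual fix, assuming the scaler fitted above (note the scores recorded below were produced without this step):

# Editor's sketch: apply the scaler fitted on x_train to x_test as well;
# fitting only on the training data avoids leaking test-set statistics
x_test = pd.DataFrame(scaler.transform(x_test))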
2.3 Building a logistic regression model with LogisticRegression
from sklearn.linear_model import LogisticRegression

model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train)
print(model_logic.score(x_test, y_test))
0.5
D:\Anoconda\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
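The DataConversionWarning is raised because y_train is a single-column DataFrame rather than a 1-D array. A hedged sketch of the usual remedy:

# Pass a flattened 1-D label array to silence the DataConversionWarning
model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train.values.ravel())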
3. Building the decision tree model
3.1 Defining a decision tree classifier and predicting
dtc = DecisionTreeClassifier()
# Train the classifier on the training set and its labels
dtc.fit(x_train, y_train)
# Predict on the test set
y_pre = dtc.predict(x_test)
# score() returns the accuracy of the classifier on the test set and its labels
score = dtc.score(x_test, y_test)

print("Predicted labels:", y_pre)
print("True labels:", y_test)
print("Accuracy:", score)
Predicted labels: [0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1
1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1]
True labels:      0
67 0
243 1
206 0
122 1
89 0
.. ..
158 0
99 1
173 0
176 1
95 1
[74 rows x 1 columns]
Accuracy: 0.9864864864864865
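Beyond a single accuracy number, a per-class breakdown can be useful; this optional sketch uses sklearn's classification_report with the label names from the encoding above:

from sklearn.metrics import classification_report

# Precision, recall and F1 per class for the default decision tree
print(classification_report(y_test, y_pre, target_names=['fire', 'notfire']))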
3.2 Building the gini and entropy decision tree models
# Build two depth-5 decision trees, one split on the 'gini' impurity index
# and one on 'entropy' (information gain)
dt_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)
dt_gini = dt_gini.fit(x_train, y_train)      # train on the training set

dt_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dt_entropy = dt_entropy.fit(x_train, y_train)

# Inspect both models' parameters
dt_gini, dt_entropy
(DecisionTreeClassifier(max_depth=5, random_state=0),
DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0))
3.3 Visualizing the trees
# plot_tree draws into the current figure, so each tree needs its own
# figure; otherwise the second call is drawn on top of the first.
plt.figure(figsize=(25, 30))                                      # 25 x 30 inch figure
plot_tree(dt_gini, filled=True, feature_names=feature_names)      # gini tree, nodes filled by class
plt.figure(figsize=(25, 30))
plot_tree(dt_entropy, filled=True, feature_names=feature_names)   # entropy tree
[Text(558.0, 1467.72, 'FFMC <= 80.1\nentropy = 0.974\nsamples = 170\nvalue = [101, 69]'),
Text(279.0, 1141.56, 'entropy = 0.0\nsamples = 67\nvalue = [0, 67]'),
Text(837.0, 1141.56, 'ISI <= 3.05\nentropy = 0.138\nsamples = 103\nvalue = [101, 2]'),
Text(558.0, 815.4000000000001, 'Temperature <= 33.5\nentropy = 0.764\nsamples = 9\nvalue = [7, 2]'),
Text(279.0, 489.24, 'entropy = 0.0\nsamples = 5\nvalue = [5, 0]'),
Text(837.0, 489.24, 'DC <= 82.75\nentropy = 1.0\nsamples = 4\nvalue = [2, 2]'),
Text(558.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2]'),
Text(1116.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [2, 0]'),
Text(1116.0, 815.4000000000001, 'entropy = 0.0\nsamples = 94\nvalue = [94, 0]')]
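For a compact, text-only view of the same splits (an optional addition, not in the original), sklearn.tree also offers export_text:

from sklearn.tree import export_text

# Print the gini tree's decision rules as indented text;
# only the first 13 names are used (the 14th column name is the target)
print(export_text(dt_gini, feature_names=list(feature_names[:13])))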
3.4 Computing training accuracy
# score() on the training set returns training accuracy (the training error is 1 - accuracy)
gini_train_score = dt_gini.score(x_train, y_train)
print("Training accuracy (gini):", gini_train_score)

entropy_train_score = dt_entropy.score(x_train, y_train)
print("Training accuracy (entropy):", entropy_train_score)
Training accuracy (gini): 1.0
Training accuracy (entropy): 1.0
3.5 Computing test accuracy
# score() on the test set returns test accuracy
gini_test_score = dt_gini.score(x_test, y_test)
print("Test accuracy (gini):", gini_test_score)

entropy_test_score = dt_entropy.score(x_test, y_test)
print("Test accuracy (entropy):", entropy_test_score)
Test accuracy (gini): 0.972972972972973
Test accuracy (entropy): 0.972972972972973
3.6 Plotting learning curves
# Plot learning curves for the gini and entropy models, with tree depth from 1 to 30
test1 = []   # training accuracy of the gini model at each depth
test2 = []   # training accuracy of the entropy model at each depth
for i in range(30):
    clf_gini = DecisionTreeClassifier(max_depth=i+1, criterion='gini', random_state=30, splitter='random')
    clf_entropy = DecisionTreeClassifier(max_depth=i+1, criterion='entropy', random_state=30, splitter='random')
    clf_gini = clf_gini.fit(x_train, y_train)        # train the models
    clf_entropy = clf_entropy.fit(x_train, y_train)
    score1 = clf_gini.score(x_train, y_train)        # training accuracy at this depth
    score2 = clf_entropy.score(x_train, y_train)
    test1.append(score1)
    test2.append(score2)

# Draw the two curves side by side
plt.subplot(1, 2, 1)
plt.plot(range(1, 31), test1, color='red', label='gini')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(range(1, 31), test2, color='blue', label='entropy')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.show()
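Because these curves score on the training set, they mainly show how quickly each tree memorizes the data. A variant that also tracks test accuracy (an editor's sketch, not part of the original) makes the overfitting gap visible:

# Track training and test accuracy together for the gini criterion
train_acc, test_acc = [], []
for depth in range(1, 31):
    clf = DecisionTreeClassifier(max_depth=depth, criterion='gini', random_state=30)
    clf.fit(x_train, y_train)
    train_acc.append(clf.score(x_train, y_train))
    test_acc.append(clf.score(x_test, y_test))

plt.plot(range(1, 31), train_acc, color='red', label='train')
plt.plot(range(1, 31), test_acc, color='blue', label='test')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()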
3.7 Model evaluation and optimization
(1) Evaluating the gini decision tree model
# Predict on the test set with the gini model
gini_pre = dt_gini.predict(x_test)

# Confusion matrix; with labels=[0, 1], sklearn's ravel() yields tn, fp, fn, tp,
# treating class 1 ('notfire') as the positive class
cm = confusion_matrix(y_test, gini_pre, labels=[0, 1])
print(cm)
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)   # accuracy
recall = tp / (tp + fn)                      # recall (true positive rate)
fpr = fp / (fp + tn)                         # false positive rate
precision = tp / (tp + fp)                   # precision
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
[[36 1]
[ 1 36]]
36 1 1 36
Accuracy: 97.30%
Recall: 97.30%
False positive rate: 2.70%
Precision: 97.30%
(2) Evaluating the entropy decision tree model
# Predict on the test set with the entropy model
entropy_pre = dt_entropy.predict(x_test)

# Confusion matrix, unpacked in sklearn's tn, fp, fn, tp order as above
cm = confusion_matrix(y_test, entropy_pre, labels=[0, 1])
print(cm)
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)   # accuracy
recall = tp / (tp + fn)                      # recall (true positive rate)
fpr = fp / (fp + tn)                         # false positive rate
precision = tp / (tp + fp)                   # precision
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
[[37 0]
[ 2 35]]
37 0 2 35
Accuracy: 97.30%
Recall: 94.59%
False positive rate: 0.00%
Precision: 100.00%
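The hand-computed values can be cross-checked against sklearn's built-in metric functions (an optional sketch; pos_label=1 matches the 'notfire' class treated as positive above):

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Cross-check the metrics for the entropy model
print(accuracy_score(y_test, entropy_pre))
print(recall_score(y_test, entropy_pre, pos_label=1))
print(precision_score(y_test, entropy_pre, pos_label=1))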
(3) Evaluating a classifier with cross-validation
# Import the cross-validation helper
from sklearn.model_selection import cross_val_score
# Import the support vector machine classifier
from sklearn.svm import SVC

# SVC with a linear kernel
svc = SVC(kernel='linear')
# Score the SVC with 10-fold cross-validation
scores = cross_val_score(svc, X, y, cv=10)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [0.96 1. 0.96 1. 1. 0.875
1. 1. 0.91666667 0.95833333]
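The per-fold scores are usually summarized by their mean and spread (editor's addition):

# Mean and standard deviation over the 10 folds
print("Mean CV score: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std()))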
plt.figure()
plt.title('k-fold cross-validation scores of the linear SVC')   # the scores are for the SVC, not the decision tree
plt.plot(range(10), scores, 'bs-')