Temperature Prediction with Random Forests in Python

Goal: predict the daily maximum temperature.

We will work through three stages:

random forest modeling -> feature selection -> more data and more features -> finding the optimal parameters

Along the way we cover two classic hyperparameter-tuning methods in machine learning (random search and grid search), plus Bayesian optimization at the end.

Reading the data

import pandas as pd

data = pd.read_csv("temps.csv")
data.head()
   year  month  day  week  temp_2  temp_1  average  actual  friend
0  2019      1    1   Fri      45      45     45.6      45      29
1  2019      1    2   Sat      44      45     45.7      44      61
2  2019      1    3   Sun      45      44     45.8      41      56
3  2019      1    4   Mon      44      41     45.9      40      53
4  2019      1    5  Tues      41      40     46.0      44      41
## check the shape of the data
print(data.shape)
(348, 9)
data.describe()
         year       month         day      temp_2      temp_1     average      actual      friend
count   348.0  348.000000  348.000000  348.000000  348.000000  348.000000  348.000000  348.000000
mean   2019.0    6.477011   15.514368   62.652299   62.701149   59.760632   62.543103   60.034483
std       0.0    3.498380    8.772982   12.165398   12.120542   10.527306   11.794146   15.626179
min    2019.0    1.000000    1.000000   35.000000   35.000000   45.100000   35.000000   28.000000
25%    2019.0    3.000000    8.000000   54.000000   54.000000   49.975000   54.000000   47.750000
50%    2019.0    6.000000   15.000000   62.500000   62.500000   58.200000   62.500000   60.000000
75%    2019.0   10.000000   23.000000   71.000000   71.000000   69.025000   71.000000   71.000000
max    2019.0   12.000000   31.000000  117.000000  117.000000   77.400000   92.000000   95.000000

Converting the dates

import datetime

years = data['year']
months = data['month']
days = data['day']

dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]

dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]

dates[:5]
[datetime.datetime(2019, 1, 1, 0, 0),
 datetime.datetime(2019, 1, 2, 0, 0),
 datetime.datetime(2019, 1, 3, 0, 0),
 datetime.datetime(2019, 1, 4, 0, 0),
 datetime.datetime(2019, 1, 5, 0, 0)]
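
The same conversion can be done in one vectorized step; a minimal sketch (dates_vectorized is our own name; pd.to_datetime parses year/month/day columns directly and returns a Series of pandas Timestamps rather than a Python list):

# One-step alternative: pandas assembles datetimes from the three columns.
dates_vectorized = pd.to_datetime(data[['year', 'month', 'day']])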

Visualizing the data

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')  # plotting style

Plot four panels: the actual max temperature (the label), yesterday's max, the max from two days prior, and a friend's forecast.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
fig.autofmt_xdate(rotation=45)
# actual max temperature (the label)
ax1.plot(dates, data['actual'])
ax1.set_xlabel(''); ax1.set_ylabel('Temperature'); ax1.set_title('Max Temp')
# yesterday's max temperature
ax2.plot(dates, data['temp_1'])
ax2.set_xlabel(''); ax2.set_ylabel('Temperature'); ax2.set_title('Yesterday Max Temp')
# max temperature two days prior
ax3.plot(dates, data['temp_2'])
ax3.set_xlabel(''); ax3.set_ylabel('Temperature'); ax3.set_title('Two Days Prior Max Temp')
# friend's forecast of the max temperature
ax4.plot(dates, data['friend'])
ax4.set_xlabel(''); ax4.set_ylabel('Temperature'); ax4.set_title('Friend Forecast')
plt.tight_layout(pad=2)


Preprocessing: categorical features such as the day of the week must be encoded as numbers the model can work with.

# one-hot encoding
data = pd.get_dummies(data)  # converts categorical columns automatically, adding suffixes to the column names
data.head(5)
   year  month  day  temp_2  temp_1  average  actual  friend  week_Fri  week_Mon  week_Sat  week_Sun  week_Thurs  week_Tues  week_Wed
0  2019      1    1      45      45     45.6      45      29         1         0         0         0           0          0         0
1  2019      1    2      44      45     45.7      44      61         0         0         1         0           0          0         0
2  2019      1    3      45      44     45.8      41      56         0         0         0         1           0          0         0
3  2019      1    4      44      41     45.9      40      53         0         1         0         0           0          0         0
4  2019      1    5      41      40     46.0      44      41         0         0         0         0           0          1         0

# features and labels
import numpy as np
# labels
labels = np.array(data['actual'])
# drop the label column from the features
data = data.drop('actual', axis=1)
# keep the feature names for later use
feature_list = list(data.columns)
# convert to a NumPy array
features = np.array(data)

# train/test split
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)
print('Training features:', train_features.shape)
print('Training labels:', train_labels.shape)
print('Test features:', test_features.shape)
print('Test labels:', test_labels.shape)


Training features: (261, 14)
Training labels: (261,)
Test features: (87, 14)
Test labels: (87,)

### Build the random forest model; evaluate with MAPE (mean absolute percentage error)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1000,random_state=42)
rf.fit(train_features,train_labels)
predictions = rf.predict(test_features)
errors = abs(predictions - test_labels)
mape = 100 * (errors / test_labels)
print('MAPE:',np.mean(mape))

MAPE: 6.016378550202468
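
For reference, MAPE here is just the mean of the per-sample absolute percentage errors, 100/n * sum(|prediction - actual| / actual). A small helper makes the metric reusable (a sketch; mape_score is our own function, though newer scikit-learn versions also ship sklearn.metrics.mean_absolute_percentage_error, which returns a fraction rather than a percentage):

def mape_score(y_true, y_pred):
    # mean absolute percentage error, in percent
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(100 * np.abs(y_pred - y_true) / y_true)

print('MAPE:', mape_score(test_labels, predictions))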

from sklearn.tree import export_graphviz
import pydot

# pull one tree out of the forest and render it (requires Graphviz)
tree = rf.estimators_[5]
export_graphviz(tree, out_file="tree.dot", feature_names=feature_list, rounded=True, precision=1)
(graph,) = pydot.graph_from_dot_file('./tree.dot')
graph.write_png('tree.png')
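
Note that export_graphviz needs the Graphviz binaries (and pydot) installed. If they are unavailable, sklearn's built-in plot_tree renders the same tree with matplotlib alone; a sketch:

# Graphviz-free alternative: draw the tree directly with matplotlib.
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(tree, feature_names=feature_list, rounded=True, precision=1, fontsize=8)
plt.savefig('tree_plot_tree.png')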

# a small, depth-limited forest gives a readable tree
rf_small = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=42)
rf_small.fit(train_features,train_labels)
tree_small = rf_small.estimators_[5]
export_graphviz(tree_small,out_file='small_tree.dot',feature_names=feature_list,rounded=True,precision=1)
(graph,) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')

# feature importances from the forest
importances = list(rf.feature_importances_)
# pair each feature name with its importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# print the ranking (a plain loop avoids the list-of-None output a comprehension produces)
for pair in feature_importances:
    print('Variable:{:20} importance: {}'.format(*pair))

Variable:temp_1               importance: 0.69
Variable:average              importance: 0.2
Variable:day                  importance: 0.03
Variable:friend               importance: 0.03
Variable:temp_2               importance: 0.02
Variable:month                importance: 0.01
Variable:year                 importance: 0.0
Variable:week_Fri             importance: 0.0
Variable:week_Mon             importance: 0.0
Variable:week_Sat             importance: 0.0
Variable:week_Sun             importance: 0.0
Variable:week_Thurs           importance: 0.0
Variable:week_Tues            importance: 0.0
Variable:week_Wed             importance: 0.0

# bar chart of the importances
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation='vertical')
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance');plt.xlabel('Variable');plt.title('Variable Importances')

Text(0.5, 1.0, 'Variable Importances')
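
Impurity-based importances can overstate correlated or high-cardinality features. As a cross-check, permutation importance (available in sklearn.inspection since scikit-learn 0.22) shuffles one feature at a time and measures how much the test score drops; a sketch:

# Cross-check the ranking with permutation importance on the test set.
from sklearn.inspection import permutation_importance

result = permutation_importance(rf, test_features, test_labels, n_repeats=10, random_state=42)
for idx in result.importances_mean.argsort()[::-1]:
    print('{:20} {:.3f}'.format(feature_list[idx], result.importances_mean[idx]))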

# try using only the two most important features
rf_most_important = RandomForestRegressor(n_estimators=1000, random_state=42)
# indices of the two most important features
important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# retrain the model
rf_most_important.fit(train_important, train_labels)
# predict
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_labels)
# evaluate, rounded to two decimals (the MAE is in degrees, not percent)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees')
mape = np.mean(100 * (errors / test_labels))
print('mape:', mape)

Mean Absolute Error: 3.92 degrees
mape: 6.243108595734665

MAPE rose from 6.0 to 6.2 instead of falling, so restricting the model to only the most important features does not help here.

# dates for the full dataset
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]
# convert to datetime
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# table of dates and the true labels
true_data = pd.DataFrame(data={'date': dates, 'actual': labels})
# a second table of dates and predictions (test set only)
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
predictions_data = pd.DataFrame(data={'date': test_dates, 'prediction': predictions})
# true values
plt.plot(true_data['date'], true_data['actual'], 'b-', label='actual')
# predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label='prediction')
plt.xticks(rotation='60')
plt.legend()

# axis labels and title
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');

### Digging deeper into the data
## What happens to the results when more data is available?
## Do new features improve the model, and what do they cost in runtime?

More data: load the extended dataset.

import pandas as pd
features = pd.read_csv('temps_extended.csv')
features.head(5)
   year  month  day  weekday  ws_1  prcp_1  snwd_1  temp_2  temp_1  average  actual  friend
0  2011      1    1      Sat  4.92    0.00       0      36      37     45.6      40      40
1  2011      1    2      Sun  5.37    0.00       0      37      40     45.7      39      50
2  2011      1    3      Mon  6.26    0.00       0      40      39     45.8      42      42
3  2011      1    4     Tues  5.59    0.00       0      39      42     45.9      38      59
4  2011      1    5      Wed  3.80    0.03       0      42      38     46.0      45      39
print('Data shape:', features.shape)
Data shape: (2191, 12)
## convert dates to the standard datetime format for later use
import datetime
years = features['year']
months = features['month']
days = features['day']
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d') for date in dates]
dates[:5]

[datetime.datetime(2011, 1, 1, 0, 0),
 datetime.datetime(2011, 1, 2, 0, 0),
 datetime.datetime(2011, 1, 3, 0, 0),
 datetime.datetime(2011, 1, 4, 0, 0),
 datetime.datetime(2011, 1, 5, 0, 0)]
# visualize the new features
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15,10))
fig.autofmt_xdate(rotation=45)
# historical average max temperature
ax1.plot(dates, features['average'])
ax1.set_xlabel(''); ax1.set_ylabel('Temperature (F)'); ax1.set_title('Historical Avg Max Temp')
# wind speed
ax2.plot(dates, features['ws_1'], 'r-')
ax2.set_xlabel(''); ax2.set_ylabel('Wind Speed (mph)'); ax2.set_title('Prior Wind Speed')
# precipitation
ax3.plot(dates, features['prcp_1'], 'r-')
ax3.set_xlabel('Date'); ax3.set_ylabel('Precipitation (in)'); ax3.set_title('Prior Precipitation')
# snow depth
ax4.plot(dates, features['snwd_1'], 'ro')
ax4.set_xlabel('Date'); ax4.set_ylabel('Snow Depth (in)'); ax4.set_title('Prior Snow Depth')

plt.tight_layout(pad=2)

Weather varies with the seasons, yet the dataset has no explicit season feature, so we create one ourselves.

# season variable derived from the month
seasons = []
for month in features['month']:
    if month in [1,2,12]:
        seasons.append('winter')
    elif month in [3,4,5]:
        seasons.append('spring')
    elif month in [6,7,8]:
        seasons.append('summer')
    elif month in [9,10,11]:
        seasons.append('fall')
# take an explicit copy so adding the column does not raise a SettingWithCopyWarning
reduced_features = features[['temp_1','prcp_1','average','actual']].copy()
reduced_features['season'] = seasons

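The loop can also be written as a vectorized mapping; a sketch of an equivalent version (month_to_season is our own helper dict):

# Equivalent vectorized version: map month -> season with a dict.
month_to_season = {12: 'winter', 1: 'winter', 2: 'winter',
                   3: 'spring', 4: 'spring', 5: 'spring',
                   6: 'summer', 7: 'summer', 8: 'summer',
                   9: 'fall', 10: 'fall', 11: 'fall'}
reduced_features['season'] = features['month'].map(month_to_season)
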
import seaborn as sns
sns.set(style='ticks', color_codes=True)
# color palette, one color per season
palette = sns.xkcd_palette(['dark blue','dark green','gold','orange'])
# pairplot colored by season
sns.pairplot(reduced_features, hue='season', diag_kind='kde', palette=palette, plot_kws=dict(alpha=0.7), diag_kws=dict(shade=True))

<seaborn.axisgrid.PairGrid at 0x23f1d1c4370>

# one-hot encoding
features = pd.get_dummies(features)
# extract the labels and drop them from the features
labels = features['actual']
features = features.drop('actual', axis=1)
# keep the feature names for later use
feature_list = list(features.columns)

# convert to NumPy arrays
import numpy as np
features = np.array(features)
labels = np.array(labels)
# train/test split
from sklearn.model_selection import train_test_split

Split the new dataset.

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=0)
print("Training features:", train_features.shape)
print("Training labels:", train_labels.shape)
print("Test features:", test_features.shape)
print("Test labels:", test_labels.shape)

Training features: (1643, 17)
Training labels: (1643,)
Test features: (548, 17)
Test labels: (548,)

import pandas as pd
import numpy as np
# align the features: indices of the columns shared with the original dataset
original_feature_indices = [feature_list.index(feature) for feature in feature_list if feature not in ['ws_1', 'prcp_1', 'snwd_1']]
# reload the old dataset
original_features = pd.read_csv('temps.csv')
original_features = pd.get_dummies(original_features)
# split off the labels
original_labels = np.array(original_features['actual'])
original_features = original_features.drop('actual', axis=1)
original_feature_list = list(original_features.columns)
original_features = np.array(original_features)
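
This index trick relies on the shared columns appearing in the same order in both arrays (the names differ only in the week_/weekday_ prefix). A quick sanity check, as a sketch:

# Sanity check: shared columns line up one-to-one across the two datasets.
shared = [feature_list[i] for i in original_feature_indices]
for new_name, old_name in zip(shared, original_feature_list):
    assert new_name.replace('weekday_', 'week_') == old_name, (new_name, old_name)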

# split the old dataset
from sklearn.model_selection import train_test_split
original_train_features, original_test_features, original_train_labels, original_test_labels = train_test_split(original_features, original_labels, test_size=0.25, random_state=42)
# model
from sklearn.ensemble import RandomForestRegressor
# same parameters and random seed throughout
rf = RandomForestRegressor(n_estimators=100, random_state=0)
# train on the old dataset
rf.fit(original_train_features, original_train_labels)
# evaluate on the shared test set, for a fair comparison
predictions = rf.predict(test_features[:, original_feature_indices])
errors = abs(predictions - test_labels)
print('Old dataset mean temperature error:', round(np.mean(errors), 2), 'degrees')
mape = 100 * (errors / test_labels)
# define an accuracy for easier comparison
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%')

Old dataset mean temperature error: 4.68 degrees
Accuracy: 92.19 %

from sklearn.ensemble import RandomForestRegressor
# keep the features consistent: drop the new columns from the bigger dataset
original_train_changed_features = train_features[:, original_feature_indices]
original_test_changed_features = test_features[:, original_feature_indices]
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(original_train_changed_features, train_labels)
# predict
baseline_predictions = rf.predict(original_test_changed_features)
# evaluate
baseline_errors = abs(baseline_predictions - test_labels)
print('New dataset mean temperature error:', round(np.mean(baseline_errors), 2), 'degrees')
baseline_mape = 100 * np.mean(baseline_errors / test_labels)
# accuracy
baseline_accuracy = 100 - baseline_mape
print('Accuracy:', round(baseline_accuracy, 2), '%')

New dataset mean temperature error: 4.2 degrees
Accuracy: 93.12 %

With more data and the same features, the error dropped from 4.68 to 4.2 degrees: more data strengthened the results.



from sklearn.ensemble import RandomForestRegressor
rf_exp = RandomForestRegressor(n_estimators=100, random_state=0)
rf_exp.fit(train_features, train_labels)
# same test set
predictions = rf_exp.predict(test_features)
# evaluate
errors = abs(predictions - test_labels)
print('Mean temperature error:', round(np.mean(errors), 2), 'degrees')
mape = np.mean(100 * (errors / test_labels))
improvement_baseline = 100 * abs(mape - baseline_mape) / baseline_mape
print('Change in model performance with the extra features:', round(improvement_baseline, 2), '%')
# accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%')

Mean temperature error: 4.05 degrees
Change in model performance with the extra features: 3.34 %
Accuracy: 93.35 %

Feature importances

importances = list(rf_exp.feature_importances_)
# pair names with values
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# sort descending
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
# print the ranking
for pair in feature_importances:
    print('Variable:{:20} Importance: {}'.format(*pair))

Variable:temp_1               Importance: 0.85
Variable:average              Importance: 0.05
Variable:ws_1                 Importance: 0.02
Variable:friend               Importance: 0.02
Variable:year                 Importance: 0.01
Variable:month                Importance: 0.01
Variable:day                  Importance: 0.01
Variable:prcp_1               Importance: 0.01
Variable:temp_2               Importance: 0.01
Variable:snwd_1               Importance: 0.0
Variable:weekday_Fri          Importance: 0.0
Variable:weekday_Mon          Importance: 0.0
Variable:weekday_Sat          Importance: 0.0
Variable:weekday_Sun          Importance: 0.0
Variable:weekday_Thurs        Importance: 0.0
Variable:weekday_Tues         Importance: 0.0
Variable:weekday_Wed          Importance: 0.0

# visualize the importances
plt.style.use('fivethirtyeight')
x_values = list(range(len(importances)))
plt.bar(x_values,importances,orientation="vertical",color="r",edgecolor="k",linewidth=1.2)
plt.xticks(x_values,feature_list,rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')

Text(0.5, 1.0, 'Variable Importances')


sorted_importances = [importance[1] for importance  in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# 累计重要性
cumulative_importances = np.cumsum(sorted_importances)
# 绘制折线图
plt.plot(x_values,cumulative_importances,'g-')
plt.hlines(y=0.95,xmin=0,xmax=len(sorted_importances),color='r',linestyles='dashed')
plt.xticks(x_values,sorted_features,rotation='vertical')
plt.xlabel('Variable');plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances')

Text(0.5, 1.0, 'Cumulative Importances')
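
From the cumulative curve we can read off how many of the sorted features are needed to cross the 95% line; a sketch (with the importances above this should select the same five features used next):

# How many features cover 95% of the total importance?
n_features_95 = np.argmax(cumulative_importances >= 0.95) + 1
print('Features needed for 95% importance:', n_features_95)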

What happens if we build the model using only these five features?

important_feature_names = [feature[0] for feature in feature_importances[0:5]]
# indices of those features
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# training and test subsets
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]
# dimensions
print("important train features shape:", important_train_features.shape)
print("important test features shape:", important_test_features.shape)
# retrain the model
rf_exp.fit(important_train_features, train_labels)
# same test set
predictions = rf_exp.predict(important_test_features)
# evaluate
errors = abs(predictions - test_labels)
print('Mean temperature error:', round(np.mean(errors), 2), 'degrees')
mape = 100 * (errors / test_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), "%")

important train features shape: (1643, 5)
important test features shape: (548, 5)
Mean temperature error: 4.11 degrees
Accuracy: 93.28 %

Accuracy did not improve, but does the smaller feature set at least improve the model's time efficiency?

import time
# time 10 rounds of training and prediction with all features
all_features_time = []
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(train_features, train_labels)
    all_features_predictions = rf_exp.predict(test_features)
    end_time = time.time()
    all_features_time.append(end_time - start_time)

all_features_time = np.mean(all_features_time)
print("Average train-and-predict time with all features:", round(all_features_time, 2), 's')

Average train-and-predict time with all features: 0.71 s
# timing with only the important features
reduced_features_time = []
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(important_train_features, train_labels)
    reduced_features_predictions = rf_exp.predict(important_test_features)
    end_time = time.time()
    reduced_features_time.append(end_time - start_time)

reduced_features_time = np.mean(reduced_features_time)
print("Average train-and-predict time with the important features:", round(reduced_features_time, 2), 's')

Average train-and-predict time with the important features: 0.42 s
# timing for the original model
original_features_time = []
for _ in range(10):
    start_time = time.time()
    rf.fit(original_train_features, original_train_labels)
    original_features_predictions = rf.predict(test_features[:, original_feature_indices])
    end_time = time.time()
    original_features_time.append(end_time - start_time)
original_features_time = np.mean(original_features_time)

print("Average train-and-predict time for the original model:", round(original_features_time, 2), 's')

Average train-and-predict time for the original model: 0.17 s

Compare the different feature sets.

# side-by-side comparison
all_accuracy = 100 * (1 - np.mean(abs(all_features_predictions - test_labels) / test_labels))
reduced_accuracy = 100 * (1 - np.mean(abs(reduced_features_predictions - test_labels) / test_labels))

# collect the results and display them
comparison = pd.DataFrame({'features': ['all(17)', 'reduced(5)'],
                           'runtime': [round(all_features_time, 2), round(reduced_features_time, 2)],
                           'accuracy': [round(all_accuracy, 2), round(reduced_accuracy, 2)]})
comparison[['features', 'accuracy', 'runtime']]

     features  accuracy  runtime
0     all(17)     93.35     0.71
1  reduced(5)     93.28     0.42

# runtime can matter more than a tiny accuracy difference
relative_accuracy_decrease = 100 * (all_accuracy - reduced_accuracy) / all_accuracy
print('Relative accuracy drop:', round(relative_accuracy_decrease, 3), "%")
relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print("Relative runtime reduction:", round(relative_runtime_decrease, 3), "%")

Relative accuracy drop: 0.071 %
Relative runtime reduction: 40.663 %

# MAE of the original model
original_mae = np.mean(abs(original_features_predictions - test_labels))
# MAE with all features
exp_all_mae = np.mean(abs(all_features_predictions - test_labels))
# MAE with the reduced feature set
exp_reduced_mae = np.mean(abs(reduced_features_predictions - test_labels))
# accuracy of the original model
original_accuracy = 100 * (1 - np.mean(abs(original_features_predictions - test_labels) / test_labels))
model_comparison = pd.DataFrame({'model': ['original', 'exp_all', 'exp_reduced'],
                                'error (degrees)': [original_mae, exp_all_mae, exp_reduced_mae],
                                'accuracy': [original_accuracy, all_accuracy, reduced_accuracy],
                                'run_time (s)': [original_features_time, all_features_time, reduced_features_time]})

# plot all experiment results side by side
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(16,5), sharex=True)
# x axis
x_values = [0, 1, 2]
model_names = list(model_comparison['model'])  # a new name, so the labels array is not clobbered
plt.xticks(x_values, model_names)
# font sizes
fontdict = {'fontsize': 18}
fontdict_yaxis = {'fontsize': 14}
# prediction error comparison
ax1.bar(x_values, model_comparison['error (degrees)'], color=['b','r','g'], edgecolor='k', linewidth=1.5)
ax1.set_ylim(bottom=3.5, top=4.5)
ax1.set_ylabel('Error (degrees) (F)', fontdict=fontdict_yaxis)
ax1.set_title('Model Error Comparison', fontdict=fontdict)
# accuracy comparison
ax2.bar(x_values, model_comparison['accuracy'], color=['b','r','g'], edgecolor='k', linewidth=1.5)
ax2.set_ylim(bottom=92, top=94)
ax2.set_ylabel('Accuracy (%)', fontdict=fontdict_yaxis)
ax2.set_title('Model Accuracy Comparison', fontdict=fontdict)
# runtime comparison
ax3.bar(x_values, model_comparison['run_time (s)'], color=['b','r','g'], edgecolor='k', linewidth=1.5)
ax3.set_ylim(bottom=0, top=1)
ax3.set_ylabel('run_time (s)', fontdict=fontdict_yaxis)
ax3.set_title('Model Run-Time Comparison', fontdict=fontdict)
plt.show()


Hyperparameter tuning

from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
rf = RandomForestRegressor(random_state=42)
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
### Automated random search
from sklearn.model_selection import RandomizedSearchCV
# number of trees
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# how max_features is chosen at each split
max_features = ['auto', 'sqrt']
# maximum tree depth
max_depth = [int(x) for x in np.linspace(10, 20, num=2)]
max_depth.append(None)
# minimum number of samples required to split an internal node
min_samples_split = [2, 5, 10]
# minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 4]
# whether to bootstrap-sample the training data
bootstrap = [True, False]
# the random search space
random_grid = {'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap': bootstrap}
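
For context on why we sample rather than search exhaustively, the full space defined above is already large, as a quick count shows (a sketch):

# Total number of parameter combinations in random_grid.
total = 1
for values in random_grid.values():
    total *= len(values)
print(total)  # 10 * 2 * 3 * 3 * 3 * 2 = 1080; n_iter=100 samples fewer than 10% of them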

#### Randomly sample parameter combinations
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf,  # model to tune
                            param_distributions=random_grid,  # candidate parameter space
                            n_iter=100,  # number of random combinations to try; the best of the 100 wins
                            scoring='neg_mean_absolute_error',  # evaluation metric
                            cv=3,  # 3-fold cross-validation
                            verbose=2,  # amount of progress output
                            random_state=42,  # random seed for reproducibility
                            n_jobs=-1)  # parallel jobs; -1 uses all cores
# start the search
rf_random.fit(train_features, train_labels)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits





{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}
# evaluation helper
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Mean temperature error:', np.mean(errors))
    print('Accuracy = {:0.2f}%'.format(accuracy))
# results with the default parameters
base_model = RandomForestRegressor(random_state=42)
base_model.fit(train_features, train_labels)
evaluate(base_model, test_features, test_labels)

Mean temperature error: 4.036259124087591
Accuracy = 93.37%
# evaluate the best estimator found by the random search
best_random = rf_random.best_estimator_
evaluate(best_random, test_features, test_labels)

Mean temperature error: 4.0074731175393135
Accuracy = 93.43%

Grid search: next we refine with an exhaustive grid centered on the neighborhood of the random-search result (one run produced the parameters below).

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

from sklearn.model_selection import GridSearchCV
# candidate parameter grid, centered on the random-search result
param_grid = {
    'n_estimators': [1600,1700,1800,1900,2000],
    'max_features': ['auto'],
    'max_depth': [8,10,12],
    'min_samples_split': [3,5,7],
    'min_samples_leaf': [2,4,5,6],  # must be ints >= 1 (or floats in (0, 0.5]); a value such as 2.3 makes every affected fit fail
    'bootstrap': [True]
}
# base model
rf = RandomForestRegressor()
# grid search
grid_search = GridSearchCV(estimator=rf,
                          param_grid=param_grid,
                          scoring='neg_mean_absolute_error',
                          cv=3,
                          n_jobs=-1,
                          verbose=2)
# start the search
grid_search.fit(train_features, train_labels)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [8, 10, 12],
                         'max_features': ['auto'],
                         'min_samples_leaf': [2, 4, 5, 6],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [1600, 1700, 1800, 1900, 2000]},
             scoring='neg_mean_absolute_error', verbose=2)
best_grid_search = grid_search.best_estimator_
evaluate(best_grid_search,test_features,test_labels)
Mean temperature error: 4.004802708677475
Accuracy = 93.44%

The final model

print('Final model parameters:\n')
pprint(best_grid_search.get_params())
Final model parameters:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 12,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 6,
 'min_samples_split': 7,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1700,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
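
To reuse the tuned model without repeating the search, it can be saved and reloaded with joblib; a sketch (the filename is our own choice):

# Persist and reload the tuned model.
from joblib import dump, load

dump(best_grid_search, 'rf_temperature_model.joblib')
rf_loaded = load('rf_temperature_model.joblib')
evaluate(rf_loaded, test_features, test_labels)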

Bayesian optimization for the best parameters

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

# objective: mean cross-validated score (default scorer, R^2) for one parameter set
def hyperopt_train_test(params):
    clf = RandomForestRegressor(**params)
    return cross_val_score(clf, train_features, train_labels).mean()

max_depth = [i for i in range(10,20)]
# max_depth.append(None)
# search space: hp.choice draws one element from each list per trial
space4rf = {
    'max_depth': hp.choice('max_depth', max_depth),
    'max_features': hp.choice('max_features', ['auto','sqrt']),
    'min_samples_split': hp.choice('min_samples_split', range(5,20)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(2,10)),
    'n_estimators': hp.choice('n_estimators', range(1000,2000)),
    'bootstrap': hp.choice('bootstrap', [True,False])
}

best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
        print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}  # hyperopt minimizes, so negate the score

trials = Trials()
best = fmin(f, space4rf, algo=tpe.suggest, max_evals=100, trials=trials)
print("best:",best)

new best:                                                                                                              
0.766416424801337                                                                                                      
{'bootstrap': False, 'max_depth': 19, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 1028}
new best:                                                                                                              
0.8644180936765691                                                                                                     
{'bootstrap': False, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 1363}
new best:                                                                                                              
0.8679831214513388                                                                                                     
{'bootstrap': True, 'max_depth': 12, 'max_features': 'auto', 'min_samples_leaf': 6, 'min_samples_split': 18, 'n_estimators': 1275}
new best:                                                                                                              
0.8683413950549937                                                                                                     
{'bootstrap': True, 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 7, 'min_samples_split': 18, 'n_estimators': 1863}
new best:                                                                                                              
0.8683946223494816                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:                                                                                                              
0.8684885517659223                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1933}
new best:                                                                                                              
0.8686051137472097                                                                                                     
{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 9, 'min_samples_split': 6, 'n_estimators': 1408}
 34%|██████████████▉                             | 34/100 [25:19<1:06:39, 60.60s/trial, best loss: -0.8686051137472097]
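
One caveat: fmin returns the indices chosen for each hp.choice list, not the parameter values themselves; hyperopt's space_eval maps them back (a sketch):

# fmin returns hp.choice indices; space_eval resolves them to actual values.
from hyperopt import space_eval
print(space_eval(space4rf, best))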

The defining trait of Bayesian optimization is that every evaluation informs the ones that follow, whereas random search and grid search treat each trial independently. Random search suits the beginning, when good parameter ranges are unknown; grid search then refines around the random-search result.
