天气最高温度
我们要完成三个任务
随机森林建模 --》 选择特征 - 》 增加数据量和特征个数 --》 找到最优的参数
掌握机器学习里面2种经典的参数调节方法
读数据
import pandas as pd
data = pd.read_csv("temps.csv" )
data.head()
year
month
day
week
temp_2
temp_1
average
actual
friend
0
2019
1
1
Fri
45
45
45.6
45
29
1
2019
1
2
Sat
44
45
45.7
44
61
2
2019
1
3
Sun
45
44
45.8
41
56
3
2019
1
4
Mon
44
41
45.9
40
53
4
2019
1
5
Tues
41
40
46.0
44
41
print (data.shape)
(348 , 9 )
data.describe()
year
month
day
temp_2
temp_1
average
actual
friend
count
348.0
348.000000
348.000000
348.000000
348.000000
348.000000
348.000000
348.000000
mean
2019.0
6.477011
15.514368
62.652299
62.701149
59.760632
62.543103
60.034483
std
0.0
3.498380
8.772982
12.165398
12.120542
10.527306
11.794146
15.626179
min
2019.0
1.000000
1.000000
35.000000
35.000000
45.100000
35.000000
28.000000
25%
2019.0
3.000000
8.000000
54.000000
54.000000
49.975000
54.000000
47.750000
50%
2019.0
6.000000
15.000000
62.500000
62.500000
58.200000
62.500000
60.000000
75%
2019.0
10.000000
23.000000
71.000000
71.000000
69.025000
71.000000
71.000000
max
2019.0
12.000000
31.000000
117.000000
117.000000
77.400000
92.000000
95.000000
时间处理函数
import datetime
years = data['year' ]
months = data['month' ]
days = data['day' ]
dates = [str (int (year)) + '-' + str (int (month)) + '-' + str (int (day)) for year , month, day in zip (years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d' ) for date in dates]
dates[:5 ]
[datetime.datetime(2019, 1, 1, 0, 0),
datetime.datetime(2019, 1, 2, 0, 0),
datetime.datetime(2019, 1, 3, 0, 0),
datetime.datetime(2019, 1, 4, 0, 0),
datetime.datetime(2019, 1, 5, 0, 0) ]
数据展示
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight' )
展示四个指标:分别为最高气温的标签值、前天、昨天、朋友预测的气温最高值,四个图。
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2 ,ncols=2 ,figsize=(10 ,10 ))
fig.autofmt_xdate(rotation=45 )
ax1.plot(dates,data['actual' ])
ax1.set_xlabel('' );ax1.set_ylabel('Temperature' );ax1.set_title('Max Temp' )
ax2.plot(dates,data['temp_1' ])
ax2.set_xlabel('' );ax2.set_ylabel('Temperature' );ax2.set_title('Yesterday Max Temp' )
ax3.plot(dates,data['temp_2' ])
ax3.set_xlabel('' );ax3.set_ylabel('Temperature' );ax3.set_title('Two Days Prior Max Temp' )
ax4.plot(dates,data['friend' ])
ax4.set_xlabel('' );ax4.set_ylabel('Temperature' );ax4.set_title('Friend Forcast' )
plt.tight_layout(pad=2 )
数据预处理,比如 周几,这个要转成 计算机可识别的数字
data = pd.get_dummies(data)
data.head(5 )
year
month
day
temp_2
temp_1
average
actual
friend
week_Fri
week_Mon
week_Sat
week_Sun
week_Thurs
week_Tues
week_Wed
0
2019
1
1
45
45
45.6
45
29
1
0
0
0
0
0
0
1
2019
1
2
44
45
45.7
44
61
0
0
1
0
0
0
0
2
2019
1
3
45
44
45.8
41
56
0
0
0
1
0
0
0
3
2019
1
4
44
41
45.9
40
53
0
1
0
0
0
0
0
4
2019
1
5
41
40
46.0
44
41
0
0
0
0
0
1
0
import numpy as np
labels = np.array(data['actual' ])
data = data.drop('actual' ,axis=1 )
feature_list = list (data.columns)
features = np.array(data)
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features,labels,test_size=0.25 ,random_state=42 )
print ('训练集特征:' ,train_features.shape)
print ('训练集标签:' ,train_labels.shape)
print ('测试集标签:' ,test_features.shape)
print ('测试机标签:' ,test_labels.shape)
训练集特征: (261 , 14 )
训练集标签: (261 ,)
测试集标签: (87 , 14 )
测试机标签: (87 ,)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1000 ,random_state=42 )
rf.fit(train_features,train_labels)
predictions = rf.predict(test_features)
errors = abs (predictions - test_labels)
mape = 100 * (errors / test_labels)
print ('MAPE:' ,np.mean(mape))
MAPE: 6.016378550202468
from sklearn.tree import export_graphviz
import pydot
import os
tree = rf.estimators_[5 ]
export_graphviz(tree,out_file="tree.dot" ,feature_names=feature_list,rounded=True ,precision=1 )
(graph,) = pydot.graph_from_dot_file('./tree.dot' )
graph.write_png('tree.png' )
rf_small = RandomForestRegressor(n_estimators=10 ,max_depth=3 ,random_state=42 )
rf_small.fit(train_features,train_labels)
tree_small = rf_small.estimators_[5 ]
export_graphviz(tree_small,out_file='small_tree.dot' ,feature_names=feature_list,rounded=True ,precision=1 )
(graph,) = pydot.graph_from_dot_file('small_tree.dot' )
graph.write_png('small_tree.png' )
importances = list (rf.feature_importances_)
feature_importances = [(feature, round (importance, 2 )) for feature, importance in zip (feature_list,importances)]
feature_importances = sorted (feature_importances,key=lambda x:x[1 ],reverse=True )
[print ('Variable:{:20} importance: {}' .format (*pair)) for pair in feature_importances]
Variable: temp_1 importance: 0.69
Variable: average importance: 0.2
Variable: day importance: 0.03
Variable: friend importance: 0.03
Variable: temp_2 importance: 0.02
Variable: month importance: 0.01
Variable: year importance: 0.0
Variable: week_Fri importance: 0.0
Variable: week_Mon importance: 0.0
Variable: week_Sat importance: 0.0
Variable: week_Sun importance: 0.0
Variable: week_Thurs importance: 0.0
Variable: week_Tues importance: 0.0
Variable: week_Wed importance: 0.0
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
x_values = list (range (len (importances)))
plt.bar(x_values,importances,orientation='vertical' )
plt.xticks(x_values,feature_list,rotation='vertical' )
plt.ylabel('Importance' );plt.xlabel('Variable' );plt.title('Variable Importances' )
Text(0.5 , 1.0 , 'Variable Importances')
rf_most_important = RandomForestRegressor(n_estimators=1000 ,random_state=42 )
important_indices = [feature_list.index('temp_1' ),feature_list.index('average' )]
train_important = train_features[:,important_indices]
test_important = test_features[:,important_indices]
rf_most_important.fit(train_important,train_labels)
predictions = rf_most_important.predict(test_important)
errors = abs (predictions-test_labels)
print ('Mean Absolute Error:' ,round (np.mean(errors),2 ),'%' )
mape = np.mean(100 *(errors/test_labels))
print ('mape:' ,mape)
Mean Absolute Error : 3.92 %
mape: 6.243108595734665
发现,mape的值从6.0上升到6.2,并没有下降,说明不能只选择最重要的特征
months = features[:,feature_list.index('month' )]
days = features[:,feature_list.index('day' )]
years = features[:,feature_list.index('year' )]
dates = [str (int (year))+'-' +str (int (month))+'-' +str (int (day)) for year, month, day in zip (years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d' ) for date in dates]
true_data = pd.DataFrame(data={'date' :dates,'actual' :labels})
months = test_features[:,feature_list.index('month' )]
days = test_features[:,feature_list.index('day' )]
years = test_features[:,feature_list.index('year' )]
test_dates = [str (int (year))+'-' +str (int (month))+'-' +str (int (day)) for year,month,day in zip (years,months,days)]
test_dates = [datetime.datetime.strptime(date,'%Y-%m-%d' ) for date in test_dates]
predictions_data = pd.DataFrame(data = {'date' :test_dates,'prediction' :predictions})
plt.plot(true_data['date' ],true_data['actual' ],'b-' ,label='actual' )
plt.plot(predictions_data['date' ],predictions_data['prediction' ],'ro' ,label='prediction' )
plt.xticks(rotation='60' )
plt.legend()
plt.xlabel('Date' ); plt.ylabel('Maximum Temperature (F)' ); plt.title('Acture and Predicted Values' );
数据增多,采用新的数据集
import pandas as pd
features = pd.read_csv('temps_extended.csv' )
features.head(5 )
year
month
day
weekday
ws_1
prcp_1
snwd_1
temp_2
temp_1
average
actual
friend
0
2011
1
1
Sat
4.92
0.00
0
36
37
45.6
40
40
1
2011
1
2
Sun
5.37
0.00
0
37
40
45.7
39
50
2
2011
1
3
Mon
6.26
0.00
0
40
39
45.8
42
42
3
2011
1
4
Tues
5.59
0.00
0
39
42
45.9
38
59
4
2011
1
5
Wed
3.80
0.03
0
42
38
46.0
45
39
print ('数据规模' ,features.shape)
数据规模 (2191 , 12 )
import datetime
years = features['year' ]
months = features['month' ]
days = features['day' ]
dates = [str (int (year)) + '-' + str (int (month)) + '-' + str (int (day)) for year, month, day in zip (years,months,days)]
dates = [datetime.datetime.strptime(date,'%Y-%m-%d' ) for date in dates]
dates[:5 ]
[datetime.datetime(2011, 1, 1, 0, 0),
datetime.datetime(2011, 1, 2, 0, 0),
datetime.datetime(2011, 1, 3, 0, 0),
datetime.datetime(2011, 1, 4, 0, 0),
datetime.datetime(2011, 1, 5, 0, 0) ]
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2 ,ncols=2 ,figsize=(15 ,10 ))
fig.autofmt_xdate(rotation=45 )
ax1.plot(dates,features['average' ])
ax1.set_xlabel('' );ax1.set_ylabel('Tempertature (F)' );ax1.set_title('Historical Avg Max Temp' )
ax2.plot(dates,features['ws_1' ],'r-' )
ax2.set_xlabel('' );ax2.set_ylabel('Wind Speed (mph))' );ax2.set_title('Prior Wind Speed' )
ax3.plot(dates,features['prcp_1' ],'r-' )
ax3.set_xlabel('Date' );ax3.set_ylabel('Precipitation (in)' );ax3.set_title('Prior Precipitation' )
ax4.plot(dates,features['snwd_1' ],'ro' )
ax4.set_xlabel('Date' );ax4.set_ylabel('Snow Depth (in)' );ax4.set_title('Prior Snow Depth' )
plt.tight_layout(pad=2 )
天气变换与季节因素有关,然而数据集中并没有体现季节的特征,可以自己创建
seasons = []
for month in features['month' ]:
if month in [1 ,2 ,12 ]:
seasons.append('winter' )
elif month in [3 ,4 ,5 ]:
seasons.append('spring' )
elif month in [6 ,7 ,8 ]:
seasons.append('summer' )
elif month in [9 ,10 ,11 ]:
seasons.append('fall' )
reduced_features = features[['temp_1' ,'prcp_1' ,'average' ,'actual' ]]
reduced_features['season' ] = seasons
C:\Users\Owner\AppData\Local \Temp\ipykernel_15292\2969630295. py:13 : SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https:/ / pandas.pydata.org/ pandas- docs/ stable/ user_guide/ indexing.html#returning- a- view - versus- a- copy
reduced_features['season' ] = seasons
import seaborn as sns
sns.set (style='ticks' ,color_codes=True )
palette = sns.xkcd_palette(['dark blue' ,'dark green' ,'gold' ,'orange' ])
sns.pairplot(reduced_features,hue='season' ,diag_kind='kde' ,palette=palette,plot_kws=dict (alpha=0.7 ),diag_kws=dict (shade=True ))
<seaborn. axisgrid. PairGrid at 0x23f1d1c4370 >
features = pd.get_dummies(features)
labels = features['actual' ]
features = features.drop('actual' ,axis=1 )
feature_list = list (features.columns)
import numpy as np
features = np.array(features)
labels = np.array(labels)
from sklearn.model_selection import train_test_split
划分新的数据集
train_features,test_features,train_labels,test_labels = train_test_split(features,labels,test_size=0.25 ,random_state=0 )
print ("训练集特征:" ,train_features.shape)
print ("训练集标签:" ,train_labels.shape)
print ("测试集特征:" ,test_features.shape)
print ("测试集标签:" ,test_labels.shape)
训练集特征: (1643 , 17 )
训练集标签: (1643 ,)
测试集特征: (548 , 17 )
测试集标签: (548 ,)
import pandas as pd
import numpy as np
original_feature_indices = [feature_list.index(feature) for feature in feature_list if feature not in ['ws_1' ,'prcp_1' ,'snwd_1' ]]
original_features = pd.read_csv('temps.csv' )
original_features = pd.get_dummies(original_features)
original_labels = np.array(original_features['actual' ])
original_features = original_features.drop('actual' ,axis=1 )
original_feature_list = list (original_features.columns)
original_features = np.array(original_features)
from sklearn.model_selection import train_test_split
original_train_features,original_test_features,original_train_labels,original_test_labels = train_test_split(original_features,original_labels,test_size=0.25 ,random_state=42 )
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100 ,random_state=0 )
rf.fit(original_train_features,original_train_labels)
predictions = rf.predict(test_features[:,original_feature_indices])
errors = abs (predictions-test_labels)
print ('老数据集平均温度误差:' ,round (np.mean(errors),2 ),'°' )
mape = 100 *(errors/test_labels)
accuracy = 100 -np.mean(mape)
print ('Accuracy:' ,round (accuracy,2 ),'%' )
老数据集平均温度误差: 4.68 °
Accuracy: 92.19 %
from sklearn.ensemble import RandomForestRegressor
original_train_changeed_features = train_features[:,original_feature_indices]
original_test_changed_features = test_features[:,original_feature_indices]
rf = RandomForestRegressor(n_estimators=100 ,random_state=0 )
rf.fit(original_train_changeed_features,train_labels)
baseline_predictions = rf.predict(original_test_changed_features)
baseline_errors = abs (baseline_predictions-test_labels)
print ('新数据集平均温度误差:' ,round (np.mean(baseline_errors),2 ),'%' )
baseline_mape = 100 * np.mean(baseline_errors/test_labels)
baseline_accuracy = 100 - baseline_mape
print ('Accuracy:' ,round (baseline_accuracy,2 ),'%' )
新数据集平均温度误差: 4.2 %
Accuracy: 93.12 %
数据增多以后,相同的特征,结果加强了
from sklearn.ensemble import RandomForestRegressor
rf_exp = RandomForestRegressor(n_estimators=100 ,random_state=0 )
rf_exp.fit(train_features,train_labels)
predictions = rf_exp.predict(test_features)
errors = abs (predictions - test_labels)
print ('平均温度误差:' ,round (np.mean(errors),2 ),"%" )
mape = np.mean(100 *(errors/test_labels))
improvement_baseline = 100 * abs (mape-baseline_mape) / baseline_mape
print ('特征增多以后模型效果变化:' ,round (improvement_baseline,2 ),'%' )
accuracy = 100 - mape
print ('Accuracy:' ,round (accuracy,2 ),'%' )
平均温度误差: 4.05
特征增多以后模型效果变化: 3.34
Accuracy: 93.35
重要特征
importances = list (rf_exp.feature_importances_)
feature_importances = [(feature,round (importance,2 )) for feature,importance in zip (feature_list,importances)]
feature_importances = sorted (feature_importances,key=lambda x:x[1 ],reverse=True )
[print ('Variable:{:20} Importance: {}' .format (*pair)) for pair in feature_importances]
Variable: temp_1 Importance: 0.85
Variable: average Importance: 0.05
Variable: ws_1 Importance: 0.02
Variable: friend Importance: 0.02
Variable: year Importance: 0.01
Variable: month Importance: 0.01
Variable: day Importance: 0.01
Variable: prcp_1 Importance: 0.01
Variable: temp_2 Importance: 0.01
Variable: snwd_1 Importance: 0.0
Variable: weekday_Fri Importance: 0.0
Variable: weekday_Mon Importance: 0.0
Variable: weekday_Sat Importance: 0.0
Variable: weekday_Sun Importance: 0.0
Variable: weekday_Thurs Importance: 0.0
Variable: weekday_Tues Importance: 0.0
Variable: weekday_Wed Importance: 0.0
[None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None]
plt.style.use('fivethirtyeight' )
x_values = list (range (len (importances)))
plt.bar(x_values,importances,orientation="vertical" ,color="r" ,edgecolor="k" ,linewidth=1.2 )
plt.xticks(x_values,feature_list,rotation='vertical' )
plt.ylabel('Importance' )
plt.xlabel('Variable' )
plt.title('Variable Importances' )
Text(0.5 , 1.0 , 'Variable Importances')
sorted_importances = [importance[1 ] for importance in feature_importances]
sorted_features = [importance[0 ] for importance in feature_importances]
cumulative_importances = np.cumsum(sorted_importances)
plt.plot(x_values,cumulative_importances,'g-' )
plt.hlines(y=0.95 ,xmin=0 ,xmax=len (sorted_importances),color='r' ,linestyles='dashed' )
plt.xticks(x_values,sorted_features,rotation='vertical' )
plt.xlabel('Variable' );plt.ylabel('Cumulative Importance' )
plt.title('Cumulative Importances' )
Text(0.5 , 1.0 , 'Cumulative Importances')
如果只使用这5个特征建模,观察结果
important_feature_names = [feature[0 ] for feature in feature_importances[0 :5 ]]
important_indices = [feature_list.index(feature) for feature in important_feature_names]
important_train_features = train_features[:,important_indices]
important_test_features = test_features[:,important_indices]
print ("important train features shape:" ,important_train_features.shape)
print ("important test features shape:" ,important_test_features.shape)
rf_exp.fit(important_train_features,train_labels)
predictions = rf_exp.predict(important_test_features)
errors = abs (predictions-test_labels)
print ('平均温度误差:' ,round (np.mean(errors),2 ),"°" )
mape = 100 *(errors/test_labels)
accuracy = 100 - np.mean(mape)
print ('Accuracy:' ,round (accuracy,2 ),"%" )
important train features shape: (1643, 5 )
important test features shape: (548, 5 )
平均温度误差: 4.11 °
Accuracy: 93.28 %
虽然没有提升效率,那么观察一下在模型时间效率上面有没有提高???
import time
all_features_time = []
for _ in range (10 ):
start_time = time.time()
rf_exp.fit(train_features,train_labels)
all_features_predictions = rf_exp.predict(test_features)
end_time = time.time()
all_features_time.append(end_time-start_time)
all_features_time = np.mean(all_features_time)
print ("使用所有特征与测试的平均时间消耗:" ,round (all_features_time,2 ),'s' )
使用所有特征与测试的平均时间消耗: 0.71 s
reduced_features_time = []
for _ in range (10 ):
start_time = time.time()
rf_exp.fit(important_train_features,train_labels)
reduced_features_predictions = rf_exp.predict(important_test_features)
end_time = time.time()
reduced_features_time.append(end_time-start_time)
reduced_features_time = np.mean(reduced_features_time)
print ("使用重要特征与测试的平均时间消耗:" ,round (reduced_features_time,2 ),'s' )
使用重要特征与测试的平均时间消耗: 0.42 s
original_features_time =[]
for _ in range (10 ):
start_time =time.time()
rf.fit(original_train_features,original_train_labels)
original_features_predictions =rf.predict(test_features[:,original_feature_indices])
end_time =time.time()
original_features_time.append(end_time -start_time)
original_features_time =np.mean(original_features_time)
print ("使用原始模型测试的平均时间消耗:" ,round (original_features_time,2 ),'s' )
使用原始模型测试的平均时间消耗: 0.17 s
不同特征 做一下对比
all_accuracy = 100 * (1 -np.mean(abs (all_features_predictions-test_labels)/test_labels))
reduced_accuracy = 100 * (1 -np.mean(abs (reduced_features_predictions-test_labels)/test_labels))
comparision = pd.DataFrame({'features' :['all(17)' ,'reduced(5)' ],
'runtime' :[round (all_features_time,2 ),round (reduced_features_time,2 )],
'accuracy' :[round (all_accuracy,2 ),round (reduced_accuracy,2 )]})
comparision[['features' ,'accuracy' ,'runtime' ]]
features
accuracy
runtime
0
all(17)
93.35
0.71
1
reduced(5)
93.28
0.42
relative_accuracy_decrease = 100 * (all_accuracy - reduced_accuracy) / all_accuracy
print ('相对accuracy提升:' ,round (relative_accuracy_decrease,3 ),"%" )
relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print ("相对时间效率提升:" ,round (relative_runtime_decrease,3 ),"%" )
相对accuracy提升: 0.071
相对时间效率提升: 40.663
original_mae = np.mean(abs (original_features_predictions -test_labels))
exp_all_mae = np.mean(abs (all_features_predictions -test_labels))
exp_reduced_mae = np.mean(abs (reduced_features_predictions -test_labels))
original_accuracy = 100 * (1 - np.mean(abs (original_features_predictions - test_labels) /test_labels))
model_comparison = pd.DataFrame({'model' : ['original' , 'exp_all' , 'exp_reduced' ],
'error (degrees)' : [original_mae, exp_all_mae, exp_reduced_mae],
'accuracy' : [original_accuracy, all_accuracy, reduced_accuracy],
'run_time (s)' : [original_features_time, all_features_time, reduced_features_time]})
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1 ,ncols=3 ,figsize=(16 ,5 ),sharex=True )
x_values = [0 ,1 ,2 ]
labels = list (model_comparison['model' ])
plt.xticks(x_values,labels)
fontdict = {'fontsize' :18 }
fontdict_yaxis = {'fontsize' :14 }
ax1.bar(x_values,model_comparison['error (degrees)' ], color=['b' ,'r' ,'g' ],edgecolor='k' ,linewidth=1.5 )
ax1.set_ylim(bottom=3.5 , top=4.5 )
ax1.set_ylabel('Error (degree) (F)' ,fontdict=fontdict_yaxis)
ax1.set_title('Model Error Comparison' ,fontdict=fontdict)
ax2.bar(x_values,model_comparison['accuracy' ],color=['b' ,'r' ,'g' ],edgecolor='k' ,linewidth=1.5 )
ax2.set_ylim(bottom=92 , top=94 )
ax2.set_ylabel('Accuracy (%)' ,fontdict=fontdict_yaxis)
ax2.set_title('Model Accuracy Comparision' ,fontdict=fontdict)
ax3.bar(x_values,model_comparison['run_time (s)' ], color=['b' ,'r' ,'g' ],edgecolor='k' ,linewidth=1.5 )
ax3.set_ylim(bottom=0 ,top=1 )
ax3.set_ylabel('run_time (s)' ,fontdict=fontdict_yaxis)
ax3.set_title('Model Run-Time Comparison' ,fontdict=fontdict)
plt.show()
模型调参
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
rf = RandomForestRegressor(random_state=42 )
pprint(rf.get_params())
{'bootstrap' : True ,
'ccp_alpha' : 0.0 ,
'criterion' : 'squared_error' ,
'max_depth' : None ,
'max_features' : 'auto' ,
'max_leaf_nodes' : None ,
'max_samples' : None ,
'min_impurity_decrease' : 0.0 ,
'min_samples_leaf' : 1 ,
'min_samples_split' : 2 ,
'min_weight_fraction_leaf' : 0.0 ,
'n_estimators' : 100 ,
'n_jobs' : None ,
'oob_score' : False ,
'random_state' : 42 ,
'verbose' : 0 ,
'warm_start' : False }
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int (x) for x in np.linspace(start=200 ,stop=2000 ,num=10 )]
max_features = ['auto' ,'sqrt' ]
max_depth = [int (x) for x in np.linspace(10 ,20 ,num=2 )]
max_depth.append(None )
min_samples_split = [2 ,5 ,10 ]
min_samples_leaf = [1 ,2 ,4 ]
bootstrap = [True ,False ]
random_grid ={'n_estimators' :n_estimators,
'max_features' :max_features,
'max_depth' :max_depth,
'min_samples_split' :min_samples_split,
'min_samples_leaf' :min_samples_leaf,
'bootstrap' :bootstrap}
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf,
param_distributions=random_grid,
n_iter=100 ,
scoring='neg_mean_absolute_error' ,
cv=3 ,
verbose=2 ,
random_state=42 ,
n_jobs=-1 )
rf_random.fit(train_features,train_labels)
rf_random.best_params_
Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators' : 2000 ,
'min_samples_split' : 5 ,
'min_samples_leaf' : 4 ,
'max_features' : 'auto' ,
'max_depth' : None ,
'bootstrap' : True }
def evaluate (model,test_features,test_labels ):
predictions = model.predict(test_features)
errors = abs (predictions - test_labels)
mape = 100 * np.mean(errors / test_labels)
accuracy = 100 - mape
print ('平均气温误差:' ,np.mean(errors))
print ('Accuracy = {:0.2f}%' .format (accuracy))
base_model = RandomForestRegressor(random_state=42 )
base_model.fit(train_features,train_labels)
evaluate(base_model,test_features,test_labels)
平均气温误差: 4.036259124087591
Accuracy = 93.37%
best_random = rf_random.best_estimator_
evaluate(best_random,test_features,test_labels)
平均气温误差: 4.0074731175393135
Accuracy = 93.43%
网格参数搜索
{'n_estimators' : 1800 ,
'min_samples_split' : 10 ,
'min_samples_leaf' : 4 ,
'max_features' : 'auto' ,
'max_depth' : None ,
'bootstrap' : True }
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators' :[1600 ,1700 ,1800 ,1900 ,2000 ],
'max_features' :['auto' ],
'max_depth' :[8 ,10 ,12 ],
'min_samples_split' :[3 ,5 ,7 ],
'min_samples_leaf' :[2.3 ,4 ,5 ,6 ],
'bootstrap' :[True ]
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf,
param_grid=param_grid,
scoring='neg_mean_absolute_error' ,
cv=3 ,
n_jobs=-1 ,
verbose=2 )
grid_search.fit(train_features,train_labels)
Fitting 3 folds for each of 180 candidates, totalling 540 fits
D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372 : FitFailedWarning:
135 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise' .
Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
File "D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py" , line 680 , in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py" , line 450 , in fit
trees = Parallel(
File "D:\anaconda3\lib\site-packages\joblib\parallel.py" , line 1863 , in __call__
return output if self.return_generator else list (output)
File "D:\anaconda3\lib\site-packages\joblib\parallel.py" , line 1792 , in _get_sequential_output
res = func(*args, **kwargs)
File "D:\anaconda3\lib\site-packages\sklearn\utils\fixes.py" , line 216 , in __call__
return self.function(*args, **kwargs)
File "D:\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py" , line 185 , in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False )
File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py" , line 1315 , in fit
super ().fit(
File "D:\anaconda3\lib\site-packages\sklearn\tree\_classes.py" , line 242 , in fit
raise ValueError(
ValueError: min_samples_leaf must be at least 1 or in (0 , 0.5 ], got 2.3
warnings.warn(some_fits_failed_message, FitFailedWarning)
D:\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969 : UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.66715249 -3.67265289 -3.6666138
-3.66880025 -3.66765682 -3.6725743 -3.66850699 -3.66715993 -3.66955895
-3.66730326 -3.66887582 -3.66898245 -3.66910087 -3.66955339 -3.66925188
-3.66767582 -3.66392168 -3.66590283 -3.66647468 -3.66916971 -3.66603121
-3.66586079 -3.66445455 -3.66298478 -3.66498142 -3.66926415 -3.66660605
-3.66211951 -3.66663106 -3.66897272 -3.66051875 -3.66402215 -3.66404952
-3.66353607 -3.6642029 -3.66047745 -3.66229798 -3.6646911 -3.65990835
-3.66086848 -3.66117259 -3.66397042 -3.66353509 -3.66311066 -3.6654521
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.66888733 -3.66731329 -3.66960138
-3.6676715 -3.66823412 -3.6687913 -3.66691494 -3.66508719 -3.67174204
-3.6732568 -3.66707472 -3.6662491 -3.67071602 -3.66858938 -3.6695038
-3.66534122 -3.66134385 -3.66749329 -3.66597971 -3.66399534 -3.66496658
-3.66704291 -3.66484829 -3.66720088 -3.6665224 -3.66248715 -3.66751772
-3.66803523 -3.66671033 -3.66589929 -3.66162471 -3.66317662 -3.66164416
-3.66021027 -3.66350166 -3.65923551 -3.66377361 -3.66143871 -3.6651609
-3.66614199 -3.66217163 -3.66642478 -3.66285729 -3.66038393 -3.66225526
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan -3.67097272 -3.66544877 -3.67074033
-3.67176597 -3.66914464 -3.66931447 -3.67020498 -3.66778535 -3.6680251
-3.66782811 -3.66677228 -3.67086388 -3.66895445 -3.67114891 -3.67163594
-3.6619978 -3.66453644 -3.66907959 -3.66519192 -3.66911132 -3.6655087
-3.6684413 -3.6656255 -3.66152951 -3.66630327 -3.66651272 -3.66543072
-3.66262121 -3.66430172 -3.66648642 -3.66166699 -3.66333263 -3.66292747
-3.66399535 -3.66247052 -3.66596842 -3.66142884 -3.66444085 -3.66254073
-3.66432689 -3.66124163 -3.65741632 -3.66360827 -3.66092641 -3.66143091 ]
warnings.warn(
GridSearchCV(cv=3 , estimator=RandomForestRegressor(), n_jobs=-1 ,
param_grid={'bootstrap' : [True ], 'max_depth' : [8 , 10 , 12 ],
'max_features' : ['auto' ],
'min_samples_leaf' : [2.3 , 4 , 5 , 6 ],
'min_samples_split' : [3 , 5 , 7 ],
'n_estimators' : [1600 , 1700 , 1800 , 1900 , 2000 ]},
scoring='neg_mean_absolute_error' , verbose=2 )
best_grid_search = grid_search.best_estimator_
evaluate(best_grid_search,test_features,test_labels)
平均气温误差: 4.004802708677475
Accuracy = 93.44%
最终模型
print ('最终模型参数:\n' )
pprint(best_grid_search.get_params())
最终模型参数:
{'bootstrap' : True ,
'ccp_alpha' : 0.0 ,
'criterion' : 'squared_error' ,
'max_depth' : 12 ,
'max_features' : 'auto' ,
'max_leaf_nodes' : None ,
'max_samples' : None ,
'min_impurity_decrease' : 0.0 ,
'min_samples_leaf' : 6 ,
'min_samples_split' : 7 ,
'min_weight_fraction_leaf' : 0.0 ,
'n_estimators' : 1700 ,
'n_jobs' : None ,
'oob_score' : False ,
'random_state' : None ,
'verbose' : 0 ,
'warm_start' : False }
贝叶斯优化寻找最佳参数
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
def hyperopt_train_test (params ):
clf = RandomForestRegressor(**params)
return cross_val_score(clf,train_features,train_labels).mean()
max_depth = [i for i in range (10 ,20 )]
space4rf = {
'max_depth' : hp.choice('max_depth' , max_depth),
'max_features' : hp.choice('max_features' , ['auto' ,'sqrt' ]),
'min_samples_split' :hp.choice('min_samples_split' ,range (5 ,20 )),
'min_samples_leaf' :hp.choice('min_samples_leaf' ,range (2 ,10 )),
'n_estimators' : hp.choice('n_estimators' , range (1000 ,2000 )),
'bootstrap' :hp.choice('bootstrap' ,[True ,False ])
}
best = 0
def f (params ):
global best
acc = hyperopt_train_test(params)
if acc > best:
best = acc
print ('new best:' , best, params)
return {'loss' : -acc, 'status' : STATUS_OK}
trials = Trials()
best = fmin(f, space4rf, algo=tpe.suggest, max_evals=100 , trials=trials)
print ("best:" ,best)
new best:
0.766416424801337
{'bootstrap': False , 'max_depth': 19 , 'max_features': 'auto' , 'min_samples_leaf': 2 , 'min_samples_split': 7 , 'n_estimators': 1028 }
new best:
0.8644180936765691
{'bootstrap': False , 'max_depth': 19 , 'max_features': 'sqrt' , 'min_samples_leaf': 2 , 'min_samples_split': 15 , 'n_estimators': 1363 }
new best:
0.8679831214513388
{'bootstrap': True , 'max_depth': 12 , 'max_features': 'auto' , 'min_samples_leaf': 6 , 'min_samples_split': 18 , 'n_estimators': 1275 }
new best:
0.8683413950549937
{'bootstrap': True , 'max_depth': 14 , 'max_features': 'auto' , 'min_samples_leaf': 7 , 'min_samples_split': 18 , 'n_estimators': 1863 }
new best:
0.8683946223494816
{'bootstrap': True , 'max_depth': 15 , 'max_features': 'auto' , 'min_samples_leaf': 9 , 'min_samples_split': 6 , 'n_estimators': 1933 }
new best:
0.8684885517659223
{'bootstrap': True , 'max_depth': 15 , 'max_features': 'auto' , 'min_samples_leaf': 9 , 'min_samples_split': 6 , 'n_estimators': 1933 }
new best:
0.8686051137472097
{'bootstrap': True , 'max_depth': 15 , 'max_features': 'auto' , 'min_samples_leaf': 9 , 'min_samples_split': 6 , 'n_estimators': 1408 }
34 %|██████████████▉ | 34 /100 [25 :19<1:06:39 , 60. 60s/trial , best loss: -0.8686051137472097 ]
贝叶斯优化的最大特点是每一次优化都对后面结果产生影响,而随机和网格每次都是独立的,不影响后面的结果,随机适合在开始时候不知道参数,网格用在随机后面。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek “源神”启动!「GitHub 热点速览」
· 我与微信审核的“相爱相杀”看个人小程序副业
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 如何使用 Uni-app 实现视频聊天(源码,支持安卓、iOS)
· C# 集成 DeepSeek 模型实现 AI 私有化(本地部署与 API 调用教程)