嵌入法与包装法
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Prepare the data: column 0 is used as the target, all remaining columns
# are used as features.
data = pd.read_csv("./digit recognizor.csv")
y = data.iloc[:, 0]  # target vector
x = data.iloc[:, 1:]  # feature matrix
Embedded 嵌入法
feature_selection.SelectFromModel
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
# Embedded selection: keep only the features whose importance, as estimated
# by a 10-tree random forest, is at least 0.005, then inspect the shape of
# the reduced feature matrix.
rfc = RFC(random_state=0, n_estimators=10)
x_embedded = SelectFromModel(estimator=rfc, threshold=0.005).fit_transform(x, y)
x_embedded.shape
(42000, 47)
# Fit the forest on the full feature matrix and read off the largest single
# feature importance. Note that this leaves `rfc` itself fitted in place.
rfc.fit(x,y).feature_importances_.max()
0.01276360214820271
# Plot the threshold learning curve: mean 10-fold cross-validation score as a
# function of the SelectFromModel importance threshold.

# Sweep 20 evenly spaced thresholds from 0 up to the largest feature importance.
threshlds = np.linspace(0, rfc.fit(x, y).feature_importances_.max(), 20)
score = []
for i in threshlds:
    # Select features at this threshold, then score a fresh forest on the
    # reduced feature matrix.
    x_embedded = SelectFromModel(rfc, threshold=i).fit_transform(x, y)
    cross_score = cross_val_score(
        RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10
    ).mean()
    score.append(cross_score)

# One tick per evaluated threshold so each point on the curve can be read off.
plt.plot(threshlds, score)
plt.xticks(threshlds, rotation=70)
plt.show()
# Select features with threshold 0.00067 and report the mean 10-fold
# cross-validation score of a fresh 10-tree forest on the reduced matrix.
x_embedded = SelectFromModel(estimator=rfc, threshold=0.00067).fit_transform(x, y)
cross_score = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=10), x_embedded, y, cv=10)
)
cross_score
0.9421904761904761
# Refine the learning curve: repeat the threshold sweep on a much narrower
# range (20 evenly spaced thresholds between 0 and 0.0000353).
threshlds2 = np.linspace(0, 0.0000353, 20)
score2 = []
for i in threshlds2:
    # Select features at this threshold, then score a fresh forest on the
    # reduced feature matrix.
    x_embedded = SelectFromModel(rfc, threshold=i).fit_transform(x, y)
    cross_score = cross_val_score(
        RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10
    ).mean()
    score2.append(cross_score)

# Wider figure so the 20 rotated tick labels stay legible.
plt.figure(figsize=(20, 8))
plt.plot(threshlds2, score2)
plt.xticks(threshlds2, rotation=70)
plt.show()
# Select features with threshold 0.00001858 and report the mean 10-fold
# cross-validation score of a fresh 10-tree forest on the reduced matrix.
x_embedded2 = SelectFromModel(estimator=rfc, threshold=0.00001858).fit_transform(x, y)
cross_score2 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=10), x_embedded2, y, cv=10)
)
cross_score2
0.9424523809523808
# Increase n_estimators: score a 100-tree forest on the same selected
# features (x_embedded2) with 10-fold cross-validation.
cross_score3 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=100), x_embedded2, y, cv=10)
)
cross_score3
0.9656428571428572
Wrapper 包装法
feature_selection.RFE
from sklearn.feature_selection import RFE
# Wrapper selection: recursive feature elimination driven by a 10-tree
# random forest, configured with step=50 and a target of 340 features.
rfc = RFC(random_state=0, n_estimators=10)
seletor = RFE(estimator=rfc, step=50, n_features_to_select=340)
seletor.fit(x, y)
# Count of features flagged as selected in the boolean support mask.
seletor.support_.sum()
340
# Show the per-feature ranking produced by RFE (one entry per input feature;
# per the scikit-learn docs, selected features are assigned rank 1).
seletor.ranking_
array([10, 9, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 6, 6,
5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 5, 4,
4, 5, 3, 4, 4, 4, 5, 4, 5, 7, 6, 7, 7, 7, 8, 8, 8,
8, 8, 8, 8, 8, 6, 7, 4, 3, 1, 2, 3, 3, 1, 1, 1, 1,
1, 3, 3, 4, 5, 5, 5, 8, 8, 9, 9, 9, 9, 8, 9, 9, 4,
4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4,
5, 5, 9, 9, 10, 10, 10, 10, 7, 4, 4, 3, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 5, 8, 10, 10, 10,
10, 9, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 3, 4, 10, 10, 10, 10, 9, 7, 4, 3, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
4, 4, 10, 9, 10, 6, 6, 4, 2, 3, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 9, 10, 8, 7,
4, 5, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 2, 1, 2, 4, 10, 10, 10, 9, 7, 5, 3, 3, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 5,
5, 9, 9, 9, 7, 5, 5, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 5, 9, 9, 9, 9, 9, 5,
4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 4, 5, 7, 10, 10, 9, 10, 9, 4, 1, 2, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 5, 10,
9, 10, 10, 9, 7, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 8, 9, 10, 10, 10, 5, 4,
2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 3, 5, 10, 10, 10, 10, 9, 5, 4, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 5, 9,
10, 10, 10, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 3, 3, 4, 8, 8, 10, 10, 9, 5, 3, 3,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
3, 3, 4, 10, 10, 10, 10, 8, 4, 3, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 5, 8, 10, 10,
10, 10, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 4, 7, 10, 10, 10, 10, 8, 5, 3, 2, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
5, 5, 7, 9, 9, 9, 9, 5, 5, 2, 2, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 5, 5, 8, 9, 9, 9,
9, 7, 4, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3, 3, 3, 5, 5, 9, 8, 9, 9, 9, 9, 9, 5, 4, 4, 2, 2,
1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 3, 4, 5, 5, 9,
8, 8, 8, 8, 8, 8, 7, 8, 6, 4, 2, 2, 1, 1, 2, 2, 1,
2, 2, 3, 2, 2, 4, 4, 5, 5, 8, 8, 8, 7, 7, 7, 7, 7,
7, 7, 5, 5, 4, 5, 4, 3, 3, 3, 4, 3, 3, 4, 3, 4, 5,
5, 6, 7, 7, 7, 6, 7, 8, 8, 8, 9, 9, 9, 9, 6, 8, 8,
8, 7, 8, 8, 8, 7, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 9,
10, 7])
# Reduce x to the RFE-selected columns and report the mean 10-fold
# cross-validation score of the 10-tree forest on them.
x_wrapper = seletor.transform(x)
cross_score4 = np.mean(cross_val_score(estimator=rfc, X=x_wrapper, y=y, cv=10))
cross_score4
0.9418095238095239
# Same RFE-selected features, scored with a 100-tree forest instead.
cross_score5 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=100), x_wrapper, y, cv=10)
)
cross_score5
0.9651190476190477