Loading

嵌入法与包装法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the digit-recognizer dataset from the current working directory.
data = pd.read_csv("./digit recognizor.csv")
# Column 0 holds the target labels; every remaining column is a feature.
y = data.iloc[:, 0]
x = data.iloc[:, 1:]

Embedded 嵌入法

feature_selection.SelectFromModel

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

# Random forest whose feature importances drive the embedded selection.
rfc = RFC(n_estimators=10, random_state=0)
# Fit the selector with an importance threshold of 0.005 and reduce x.
embedded_selector = SelectFromModel(rfc, threshold=0.005)
x_embedded = embedded_selector.fit_transform(x, y)
# Shape of the reduced feature matrix.
x_embedded.shape
(42000, 47)
# Largest single-feature importance of the forest fitted on the full data.
np.max(rfc.fit(x, y).feature_importances_)
0.01276360214820271
# Plot the threshold learning curve: mean 10-fold cross-validation score for
# 20 thresholds evenly spaced between 0 and the largest feature importance.
max_importance = rfc.fit(x, y).feature_importances_.max()
threshlds = np.linspace(0, max_importance, 20)
score = []
for threshold in threshlds:
    # Reduce the features at this threshold, then score a fresh forest on them.
    selector = SelectFromModel(rfc, threshold=threshold)
    x_embedded = selector.fit_transform(x, y)
    fold_scores = cross_val_score(
        RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10
    )
    cross_score = fold_scores.mean()
    score.append(cross_score)
plt.plot(threshlds, score)
# Put one tick on every evaluated threshold; rotate labels so they stay legible.
plt.xticks(threshlds, rotation=70)
plt.show()

# Re-select the features with threshold 0.00067 and report the mean 10-fold
# cross-validation score of a 10-tree forest on the reduced matrix.
selector = SelectFromModel(rfc, threshold=0.00067)
x_embedded = selector.fit_transform(x, y)
fold_scores = cross_val_score(
    RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10
)
cross_score = fold_scores.mean()
cross_score
0.9421904761904761
# Refine the learning curve: repeat the threshold sweep on 20 evenly spaced
# thresholds between 0 and 0.0000353.
threshlds2 = np.linspace(0, 0.0000353, 20)
score2 = []
for threshold in threshlds2:
    # Reduce the features at this threshold, then score a fresh forest on them.
    selector = SelectFromModel(rfc, threshold=threshold)
    x_embedded = selector.fit_transform(x, y)
    fold_scores = cross_val_score(
        RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10
    )
    cross_score = fold_scores.mean()
    score2.append(cross_score)
# Use a 20x8 figure for this plot.
plt.figure(figsize=(20, 8))
plt.plot(threshlds2, score2)
plt.xticks(threshlds2, rotation=70)
plt.show()


# Select the features with threshold 0.00001858 and report the mean 10-fold
# cross-validation score of a 10-tree forest on the reduced matrix.
selector2 = SelectFromModel(rfc, threshold=0.00001858)
x_embedded2 = selector2.fit_transform(x, y)
fold_scores2 = cross_val_score(
    RFC(n_estimators=10, random_state=0), x_embedded2, y, cv=10
)
cross_score2 = fold_scores2.mean()
cross_score2
0.9424523809523808
# Raise n_estimators to 100 and re-score on the same selected features
# (x_embedded2) with 10-fold cross-validation.
bigger_forest = RFC(n_estimators=100, random_state=0)
fold_scores3 = cross_val_score(bigger_forest, x_embedded2, y, cv=10)
cross_score3 = fold_scores3.mean()
cross_score3
0.9656428571428572

Wrapper 包装法

feature_selection.RFE

from sklearn.feature_selection import RFE
# Fresh 10-tree forest used as the estimator inside recursive feature elimination.
rfc = RFC(n_estimators=10, random_state=0)
# Configure RFE with n_features_to_select=340 and step=50, then fit it on the
# full data.
seletor = RFE(rfc, n_features_to_select=340, step=50)
seletor.fit(x, y)
# Number of features flagged as selected in the support mask.
seletor.support_.sum()
340
# Per-feature ranking produced by the fitted RFE selector (per the
# scikit-learn RFE documentation, selected features are assigned rank 1).
seletor.ranking_
array([10,  9,  8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  6,  6,
        5,  6,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  6,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  5,  4,
        4,  5,  3,  4,  4,  4,  5,  4,  5,  7,  6,  7,  7,  7,  8,  8,  8,
        8,  8,  8,  8,  8,  6,  7,  4,  3,  1,  2,  3,  3,  1,  1,  1,  1,
        1,  3,  3,  4,  5,  5,  5,  8,  8,  9,  9,  9,  9,  8,  9,  9,  4,
        4,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  4,
        5,  5,  9,  9, 10, 10, 10, 10,  7,  4,  4,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  5,  8, 10, 10, 10,
       10,  9,  4,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  3,  4, 10, 10, 10, 10,  9,  7,  4,  3,  2,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        4,  4, 10,  9, 10,  6,  6,  4,  2,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5,  9, 10,  8,  7,
        4,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  2,  1,  2,  4, 10, 10, 10,  9,  7,  5,  3,  3,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  5,
        5,  9,  9,  9,  7,  5,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  9,  9,  9,  9,  9,  5,
        4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  4,  5,  7, 10, 10,  9, 10,  9,  4,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5, 10,
        9, 10, 10,  9,  7,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  2,  4,  8,  9, 10, 10, 10,  5,  4,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  2,  3,  5, 10, 10, 10, 10,  9,  5,  4,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  4,  5,  9,
       10, 10, 10,  5,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  3,  3,  4,  8,  8, 10, 10,  9,  5,  3,  3,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        3,  3,  4, 10, 10, 10, 10,  8,  4,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  8, 10, 10,
       10, 10,  5,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  2,  4,  7, 10, 10, 10, 10,  8,  5,  3,  2,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,
        5,  5,  7,  9,  9,  9,  9,  5,  5,  2,  2,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  2,  2,  2,  3,  4,  5,  5,  8,  9,  9,  9,
        9,  7,  4,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        3,  3,  3,  5,  5,  9,  8,  9,  9,  9,  9,  9,  5,  4,  4,  2,  2,
        1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  2,  3,  4,  5,  5,  9,
        8,  8,  8,  8,  8,  8,  7,  8,  6,  4,  2,  2,  1,  1,  2,  2,  1,
        2,  2,  3,  2,  2,  4,  4,  5,  5,  8,  8,  8,  7,  7,  7,  7,  7,
        7,  7,  5,  5,  4,  5,  4,  3,  3,  3,  4,  3,  3,  4,  3,  4,  5,
        5,  6,  7,  7,  7,  6,  7,  8,  8,  8,  9,  9,  9,  9,  6,  8,  8,
        8,  7,  8,  8,  8,  7,  8,  8,  8,  8,  8,  7,  8,  8,  8,  8,  9,
       10,  7])
# Reduce x to the features chosen by RFE and report the mean 10-fold
# cross-validation score of the 10-tree forest on them.
x_wrapper = seletor.transform(x)
wrapper_fold_scores = cross_val_score(rfc, x_wrapper, y, cv=10)
cross_score4 = wrapper_fold_scores.mean()
cross_score4
0.9418095238095239
# Score the RFE-selected features again, this time with a 100-tree forest
# (mean of 10-fold cross-validation).
bigger_forest = RFC(n_estimators=100, random_state=0)
wrapper_fold_scores_100 = cross_val_score(bigger_forest, x_wrapper, y, cv=10)
cross_score5 = wrapper_fold_scores_100.mean()
cross_score5
0.9651190476190477
posted @ 2023-04-08 21:08  ThankCAT  阅读(24)  评论(0编辑  收藏  举报