嵌入法与包装法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Prepare the data: column 0 of the CSV holds the target, every remaining
# column is a feature.
data = pd.read_csv("./digit recognizor.csv")
y = data.iloc[:, 0]   # target vector
x = data.iloc[:, 1:]  # feature matrix

Embedded 嵌入法

feature_selection.SelectFromModel

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

# Embedded selection: keep only the features whose importance in a small
# random forest reaches the 0.005 threshold, then inspect the reduced shape.
rfc = RFC(n_estimators=10, random_state=0)
embedded_selector = SelectFromModel(rfc, threshold=0.005)
x_embedded = embedded_selector.fit_transform(x, y)
x_embedded.shape

(42000, 47)

# Fit the forest on all features, then show its largest feature importance.
rfc.fit(x, y)
rfc.feature_importances_.max()

0.01276360214820271

# Threshold learning curve: scan 20 evenly spaced thresholds between 0 and the
# largest feature importance, recording the mean 10-fold CV accuracy of a
# fresh forest trained on the features kept at each threshold.
#
# The forest that supplies the importances is fitted once and handed to
# SelectFromModel with prefit=True. Refitting it per threshold would rebuild
# an identical model every time (same data, same random_state).
rfc.fit(x, y)
threshlds = np.linspace(0, rfc.feature_importances_.max(), 20)
# A prefit selector is never fitted itself, so give it a plain array to avoid
# feature-name mismatch warnings on transform.
x_array = x.to_numpy()
score = []
for i in threshlds:
    x_embedded = SelectFromModel(rfc, threshold=i, prefit=True).transform(x_array)
    cross_score = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10).mean()
    score.append(cross_score)
plt.plot(threshlds, score)
plt.xticks(threshlds, rotation=70)
plt.show()

# Select features at threshold 0.00067 and score the reduced feature set with
# 10-fold cross-validation on a 10-tree forest.
threshold_selector = SelectFromModel(rfc, threshold=0.00067)
x_embedded = threshold_selector.fit_transform(x, y)
fold_scores = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10)
cross_score = fold_scores.mean()
cross_score

0.9421904761904761

# Refined learning curve: repeat the threshold scan on the much narrower
# range [0, 0.0000353] to locate the best threshold more precisely.
#
# The forest that supplies the importances is fitted once and reused through
# prefit=True instead of being refitted identically for every threshold.
rfc.fit(x, y)
# A prefit selector is never fitted itself, so give it a plain array to avoid
# feature-name mismatch warnings on transform.
x_array = x.to_numpy()
threshlds2 = np.linspace(0, 0.0000353, 20)
score2 = []
for i in threshlds2:
    x_embedded = SelectFromModel(rfc, threshold=i, prefit=True).transform(x_array)
    cross_score = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10).mean()
    score2.append(cross_score)
plt.figure(figsize=(20, 8))
plt.plot(threshlds2, score2)
plt.xticks(threshlds2, rotation=70)
plt.show()

# Select features at threshold 0.00001858 and score the result with 10-fold
# cross-validation on a 10-tree forest.
refined_selector = SelectFromModel(rfc, threshold=0.00001858)
x_embedded2 = refined_selector.fit_transform(x, y)
fold_scores2 = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded2, y, cv=10)
cross_score2 = fold_scores2.mean()
cross_score2

0.9424523809523808

# Increase n_estimators from 10 to 100 and re-score on the same selected
# features.
larger_forest = RFC(n_estimators=100, random_state=0)
cross_score3 = cross_val_score(larger_forest, x_embedded2, y, cv=10).mean()
cross_score3

0.9656428571428572

Wrapper 包装法

feature_selection.RFE

from sklearn.feature_selection import RFE
# Wrapper selection: recursive feature elimination around a 10-tree forest,
# removing 50 features per step until 340 remain.
rfc = RFC(n_estimators=10, random_state=0)
rfe = RFE(estimator=rfc, n_features_to_select=340, step=50)
seletor = rfe.fit(x, y)

# Sum of the selector's support mask, i.e. how many features were kept.
seletor.support_.sum()

# Per-feature ranking produced by the fitted RFE selector.
seletor.ranking_

array([10,  9,  8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  6,  6,
        5,  6,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  6,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  5,  4,
        4,  5,  3,  4,  4,  4,  5,  4,  5,  7,  6,  7,  7,  7,  8,  8,  8,
        8,  8,  8,  8,  8,  6,  7,  4,  3,  1,  2,  3,  3,  1,  1,  1,  1,
        1,  3,  3,  4,  5,  5,  5,  8,  8,  9,  9,  9,  9,  8,  9,  9,  4,
        4,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  4,
        5,  5,  9,  9, 10, 10, 10, 10,  7,  4,  4,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  5,  8, 10, 10, 10,
       10,  9,  4,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  3,  4, 10, 10, 10, 10,  9,  7,  4,  3,  2,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        4,  4, 10,  9, 10,  6,  6,  4,  2,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5,  9, 10,  8,  7,
        4,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  2,  1,  2,  4, 10, 10, 10,  9,  7,  5,  3,  3,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  5,
        5,  9,  9,  9,  7,  5,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  9,  9,  9,  9,  9,  5,
        4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  4,  5,  7, 10, 10,  9, 10,  9,  4,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5, 10,
        9, 10, 10,  9,  7,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  2,  4,  8,  9, 10, 10, 10,  5,  4,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  2,  3,  5, 10, 10, 10, 10,  9,  5,  4,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  4,  5,  9,
       10, 10, 10,  5,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  3,  3,  4,  8,  8, 10, 10,  9,  5,  3,  3,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        3,  3,  4, 10, 10, 10, 10,  8,  4,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  8, 10, 10,
       10, 10,  5,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  2,  4,  7, 10, 10, 10, 10,  8,  5,  3,  2,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,
        5,  5,  7,  9,  9,  9,  9,  5,  5,  2,  2,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  2,  2,  2,  3,  4,  5,  5,  8,  9,  9,  9,
        9,  7,  4,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        3,  3,  3,  5,  5,  9,  8,  9,  9,  9,  9,  9,  5,  4,  4,  2,  2,
        1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  2,  3,  4,  5,  5,  9,
        8,  8,  8,  8,  8,  8,  7,  8,  6,  4,  2,  2,  1,  1,  2,  2,  1,
        2,  2,  3,  2,  2,  4,  4,  5,  5,  8,  8,  8,  7,  7,  7,  7,  7,
        7,  7,  5,  5,  4,  5,  4,  3,  3,  3,  4,  3,  3,  4,  3,  4,  5,
        5,  6,  7,  7,  7,  6,  7,  8,  8,  8,  9,  9,  9,  9,  6,  8,  8,
        8,  7,  8,  8,  8,  7,  8,  8,  8,  8,  8,  7,  8,  8,  8,  8,  9,
       10,  7])

# Reduce x to the features chosen by RFE, then score that subset with 10-fold
# cross-validation using the 10-tree forest.
x_wrapper = seletor.transform(x)

wrapper_fold_scores = cross_val_score(rfc, x_wrapper, y, cv=10)
cross_score4 = wrapper_fold_scores.mean()
cross_score4

0.9418095238095239

# Score the RFE-selected features again with a 100-tree forest.
wrapper_forest = RFC(n_estimators=100, random_state=0)
cross_score5 = cross_val_score(wrapper_forest, x_wrapper, y, cv=10).mean()
cross_score5

0.9651190476190477

posted @ 2023-04-08 21:08 ThankCAT 阅读(24) 评论(0) 编辑收藏举报

刷新页面返回顶部

Loading

Thank CAT

嵌入法与包装法

Embedded 嵌入法

feature_selection.SelectFromModel

Wrapper 包装法

feature_selection.RFE

公告