嵌入法与包装法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Prepare the data: column 0 of the CSV is the target, the rest are features.
data = pd.read_csv("./digit recognizor.csv")
y = data.iloc[:, 0]  # target vector
x = data.iloc[:, 1:]  # feature matrix

Embedded 嵌入法

feature_selection.SelectFromModel

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

# Embedded selection: fit a 10-tree random forest inside SelectFromModel with
# an importance threshold of 0.005, then display the shape of the reduced
# feature matrix.
rfc = RFC(random_state=0, n_estimators=10)
x_embedded = SelectFromModel(estimator=rfc, threshold=0.005).fit(x, y).transform(x)
x_embedded.shape
(42000, 47)
# Fit the forest on the full data set and display its largest feature importance.
np.max(rfc.fit(x, y).feature_importances_)
0.01276360214820271
# Plot the threshold learning curve: cross-validated accuracy (cv=10) for 20
# importance thresholds evenly spaced between 0 and the largest importance.
#
# The forest is deterministic (random_state=0), so it is fitted once here and
# reused with prefit=True; letting SelectFromModel refit an identical forest
# for every threshold only repeated the same work 20 times.
# NOTE(review): features are selected on the full data set before
# cross-validation, so the plotted scores are probably optimistic — consider
# wrapping the selector and the classifier in a Pipeline.
rfc.fit(x, y)
threshlds = np.linspace(0, rfc.feature_importances_.max(), 20)
score = []
for i in threshlds:
    # prefit=True: transform directly with the already-fitted forest.
    x_embedded = SelectFromModel(rfc, threshold=i, prefit=True).transform(x)
    cross_score = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10).mean()
    score.append(cross_score)
plt.plot(threshlds, score)
plt.xticks(threshlds, rotation=70)
plt.show()

# Re-select the features with threshold 0.00067 and display the mean
# cross-validation score (cv=10) of a fresh 10-tree forest on them.
x_embedded = SelectFromModel(estimator=rfc, threshold=0.00067).fit(x, y).transform(x)
cross_score = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=10), x_embedded, y, cv=10)
)
cross_score
0.9421904761904761
# Refine the learning curve: repeat the threshold sweep on the narrower range
# from 0 to 0.0000353 (20 evenly spaced values) and plot it on a wider figure.
#
# The forest is deterministic (random_state=0), so it is fitted once here and
# reused with prefit=True instead of being refitted by SelectFromModel for
# every threshold.
# NOTE(review): features are selected on the full data set before
# cross-validation, so the plotted scores are probably optimistic.
rfc.fit(x, y)
threshlds2 = np.linspace(0, 0.0000353, 20)
score2 = []
for i in threshlds2:
    # prefit=True: transform directly with the already-fitted forest.
    x_embedded = SelectFromModel(rfc, threshold=i, prefit=True).transform(x)
    cross_score = cross_val_score(RFC(n_estimators=10, random_state=0), x_embedded, y, cv=10).mean()
    score2.append(cross_score)
plt.figure(figsize=(20, 8))
plt.plot(threshlds2, score2)
plt.xticks(threshlds2, rotation=70)
plt.show()


# Select features with threshold 0.00001858 and display the mean
# cross-validation score (cv=10) of a 10-tree forest on the result.
x_embedded2 = SelectFromModel(estimator=rfc, threshold=0.00001858).fit(x, y).transform(x)
cross_score2 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=10), x_embedded2, y, cv=10)
)
cross_score2
0.9424523809523808
# Raise n_estimators to 100 and score the same selected features again.
cross_score3 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=100), x_embedded2, y, cv=10)
)
cross_score3
0.9656428571428572

Wrapper 包装法

feature_selection.RFE

from sklearn.feature_selection import RFE
# Wrapper selection: recursive feature elimination around a 10-tree forest,
# dropping 50 features per step until 340 remain; then display how many
# features are flagged as selected.
rfc = RFC(random_state=0, n_estimators=10)
seletor = RFE(estimator=rfc, step=50, n_features_to_select=340)
seletor.fit(x, y)
np.sum(seletor.support_)
340
# Display the RFE ranking of every feature (per the sklearn RFE docs, rank 1
# marks the selected features).
seletor.ranking_
array([10,  9,  8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  6,  6,
        5,  6,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  6,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  5,  4,
        4,  5,  3,  4,  4,  4,  5,  4,  5,  7,  6,  7,  7,  7,  8,  8,  8,
        8,  8,  8,  8,  8,  6,  7,  4,  3,  1,  2,  3,  3,  1,  1,  1,  1,
        1,  3,  3,  4,  5,  5,  5,  8,  8,  9,  9,  9,  9,  8,  9,  9,  4,
        4,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  4,
        5,  5,  9,  9, 10, 10, 10, 10,  7,  4,  4,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  5,  8, 10, 10, 10,
       10,  9,  4,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  3,  4, 10, 10, 10, 10,  9,  7,  4,  3,  2,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        4,  4, 10,  9, 10,  6,  6,  4,  2,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5,  9, 10,  8,  7,
        4,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  2,  1,  2,  4, 10, 10, 10,  9,  7,  5,  3,  3,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  5,
        5,  9,  9,  9,  7,  5,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  9,  9,  9,  9,  9,  5,
        4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  4,  5,  7, 10, 10,  9, 10,  9,  4,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5, 10,
        9, 10, 10,  9,  7,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  2,  2,  4,  8,  9, 10, 10, 10,  5,  4,
        2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  2,  3,  5, 10, 10, 10, 10,  9,  5,  4,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  3,  4,  5,  9,
       10, 10, 10,  5,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  3,  3,  4,  8,  8, 10, 10,  9,  5,  3,  3,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        3,  3,  4, 10, 10, 10, 10,  8,  4,  3,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  4,  5,  8, 10, 10,
       10, 10,  5,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  2,  4,  7, 10, 10, 10, 10,  8,  5,  3,  2,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,
        5,  5,  7,  9,  9,  9,  9,  5,  5,  2,  2,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  2,  2,  2,  3,  4,  5,  5,  8,  9,  9,  9,
        9,  7,  4,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        3,  3,  3,  5,  5,  9,  8,  9,  9,  9,  9,  9,  5,  4,  4,  2,  2,
        1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  2,  3,  4,  5,  5,  9,
        8,  8,  8,  8,  8,  8,  7,  8,  6,  4,  2,  2,  1,  1,  2,  2,  1,
        2,  2,  3,  2,  2,  4,  4,  5,  5,  8,  8,  8,  7,  7,  7,  7,  7,
        7,  7,  5,  5,  4,  5,  4,  3,  3,  3,  4,  3,  3,  4,  3,  4,  5,
        5,  6,  7,  7,  7,  6,  7,  8,  8,  8,  9,  9,  9,  9,  6,  8,  8,
        8,  7,  8,  8,  8,  7,  8,  8,  8,  8,  8,  7,  8,  8,  8,  8,  9,
       10,  7])
# Reduce x to the RFE-selected features and display the mean cross-validation
# score (cv=10) of the forest held in rfc.
x_wrapper = seletor.transform(x)
cross_score4 = np.mean(cross_val_score(rfc, x_wrapper, y, cv=10))
cross_score4
0.9418095238095239
# Score the RFE-selected features again with a 100-tree forest.
cross_score5 = np.mean(
    cross_val_score(RFC(random_state=0, n_estimators=100), x_wrapper, y, cv=10)
)
cross_score5
0.9651190476190477

作者:Hovey

出处:https://www.cnblogs.com/thankcat/p/17299254.html

版权:本作品采用「署名-非商业性使用-相同方式共享 4.0 国际」许可协议进行许可。

posted @   ThankCAT  阅读(25)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
more_horiz
keyboard_arrow_up dark_mode palette
选择主题
menu
点击右上角即可分享
微信分享提示