Adding Custom Features

一. Feature Evaluation

1. Mutual Information
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
# Plot the MI scores as a horizontal bar chart
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
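A quick way to use these helpers (a minimal sketch, assuming `df` holds the Ames housing training data with its `SalePrice` target, as in the rest of this post):
# Compute and plot mutual-information scores for all candidate features
X = df.copy()
y = X.pop("SalePrice")
mi_scores = make_mi_scores(X, y)
print(mi_scores.head(10))  # strongest candidates first
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores.head(20))
plt.show()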
2. Cross-Validation
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
def score_dataset(X, y, model=XGBRegressor()):
    # Work on a copy so the caller's DataFrame is not label-encoded in place
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score
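With the scorer in place, a baseline for the untouched feature set is one call away (same `X`/`y` as in the sketch above):
# Baseline RMSLE before any engineered features are added
baseline_score = score_dataset(X, y)
print(f"Baseline RMSLE: {baseline_score:.5f}")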

二. Adding Features

1. Direct arithmetic on numeric features
X_1 = pd.DataFrame()
X_1["LivLotRatio"] = df.GrLivArea / df.LotArea
X_1["Spaciousness"] = (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd
X_1["TotalOutsideSF"] = df.WoodDeckSF + df.OpenPorchSF + df.EnclosedPorch + df.Threeseasonporch + df.ScreenPorch
2. Interactions with categorical features
# One-hot encode BldgType. Use `prefix="Bldg"` in `get_dummies`
X_2 = pd.get_dummies(df.BldgType, prefix="Bldg")
# Multiply
X_2 = X_2.mul(df.GrLivArea, axis=0)
3. Count features (how many porch types are present)
X_3 = pd.DataFrame()
X_3["PorchTypes"] = df[[
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]].gt(0.0).sum(axis=1)
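`gt(0.0).sum(axis=1)` counts, row by row, how many of the listed porch columns are non-zero; a tiny toy frame (made-up numbers, for illustration only) shows the pattern:
# Toy illustration: 1 porch type present in the first row, 2 in the second
demo = pd.DataFrame({"WoodDeckSF": [0, 120], "OpenPorchSF": [30, 0], "ScreenPorch": [0, 60]})
print(demo.gt(0.0).sum(axis=1))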
4. Splitting a categorical feature into parts
# First inspect the distinct values of this non-numeric categorical column
df.MSSubClass.unique()

X_4 = pd.DataFrame()
# Extract the substring before the first '_'
X_4["MSClass"] = df.MSSubClass.str.split("_", n=1, expand=True)[0]
5. Group transforms
X_5 = pd.DataFrame()
# Other handy aggregations include max, min, mean, median, var, std, and count
X_5["MedNhbdArea"] = df.groupby("Neighborhood")["GrLivArea"].transform("median")
6. k-means clustering features
from sklearn.cluster import KMeans
# Define a list of the features to be used for the clustering
features = []
# Standardize
X_scaled = X.loc[:, features]
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)

# Cluster label as a new categorical feature
kmeans = KMeans(n_clusters=10, random_state=0)
X["Cluster"] = kmeans.fit_predict(X_scaled)

# Distance to each cluster centroid as features
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
# Create the cluster-distance features using `fit_transform`
X_cd = kmeans.fit_transform(X_scaled)
# Label the features and keep the original row index so they can be joined back to the dataset
X_cd = pd.DataFrame(
    X_cd,
    index=X.index,
    columns=[f"Centroid_{i}" for i in range(X_cd.shape[1])],
)
7. Principal Component Analysis (dimensionality-reduction features)
#PCA
from sklearn.decomposition import PCA
def apply_pca(X, standardize=True):
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to a dataframe, keeping the original row index
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings
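Calling the helper returns everything needed to inspect the components; the sketch below assumes the `features` list defined for clustering above has been filled with a few correlated numeric columns:
# Run PCA on the selected columns and inspect loadings and explained variance
pca, X_pca, loadings = apply_pca(X.loc[:, features])
print(loadings)                           # how each original column contributes to each PC
print(pca.explained_variance_ratio_)      # share of variance captured by each component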
8. Combining the new features
X_new = X.join([X_1, X_2, X_3, X_4, X_5])
X_new = X_new.join(X_cd)
X_new = X_new.join(X_pca)
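The augmented frame can then be scored against the baseline with the score_dataset helper from the evaluation section (same `y` as before):
# Compare the enlarged feature set with the baseline RMSLE
new_score = score_dataset(X_new, y)
print(f"RMSLE with new features: {new_score:.5f}")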