一. 特征评价
1. 互信息
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt
def make_mi_scores(X, y):
    """Compute mutual-information scores of every feature in X against target y.

    Categorical (object/category) columns are label-encoded first; columns
    with integer dtype are then treated as discrete by
    ``mutual_info_regression``. Returns a Series sorted high-to-low.
    """
    X = X.copy()
    # Label-encode categoricals so sklearn can consume them.
    for name in X.select_dtypes(["object", "category"]):
        X[name], _ = X[name].factorize()
    # After factorizing, every discrete feature carries an integer dtype.
    discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes]
    scores = mutual_info_regression(X, y, discrete_features=discrete, random_state=0)
    return pd.Series(scores, name="MI Scores", index=X.columns).sort_values(ascending=False)
#图像绘制
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores, best feature at the top."""
    scores = scores.sort_values(ascending=True)
    positions = np.arange(len(scores))
    plt.barh(positions, scores)
    plt.yticks(positions, list(scores.index))
    plt.title("Mutual Information Scores")
2. 交叉验证
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
def score_dataset(X, y, model=None):
    """Score features X against target y with 5-fold cross-validated RMSLE.

    Parameters
    ----------
    X : DataFrame of features (categoricals are label-encoded internally).
    y : target Series.
    model : estimator, optional. Defaults to a fresh ``XGBRegressor()``
        per call. (A default of ``model=XGBRegressor()`` would be built
        once at definition time and the same fitted instance shared
        across every call — the mutable-default pitfall.)

    Returns
    -------
    float : RMSLE (Root Mean Squared Log Error, the Housing metric).
    """
    if model is None:
        model = XGBRegressor()
    # Work on a copy so the caller's DataFrame is not mutated
    # (make_mi_scores does the same).
    X = X.copy()
    # Label encoding for categoricals.
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # scorer is negated MSLE; flip sign and take sqrt to get RMSLE.
    score = -1 * score.mean()
    return np.sqrt(score)
二. 特征添加
1. 数值直接运算
# Ratio/derived numeric features built directly from existing columns.
X_1 = pd.DataFrame({
    "LivLotRatio": df.GrLivArea / df.LotArea,
    "Spaciousness": (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd,
    # NaN-propagating sum on purpose: chained + (not .sum) keeps missing rows missing.
    "TotalOutsideSF": (
        df.WoodDeckSF
        + df.OpenPorchSF
        + df.EnclosedPorch
        + df.Threeseasonporch
        + df.ScreenPorch
    ),
})
2. 与分类数据运算
# One-hot encode BldgType (columns prefixed "Bldg"), then scale each
# indicator column by the dwelling's living area — an interaction feature.
X_2 = pd.get_dummies(df.BldgType, prefix="Bldg").mul(df.GrLivArea, axis=0)
3. 出现频率
# Count how many kinds of outdoor porch/deck area a house has (value > 0).
porch_columns = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]
X_3 = pd.DataFrame()
X_3["PorchTypes"] = df[porch_columns].gt(0.0).sum(axis=1)
4. 分解提取分类特征
# Inspect the discrete, non-numeric values first.
df.MSSubClass.unique()
X_4 = pd.DataFrame()
# Keep only the leading token before the first '_' as a coarser class label.
X_4["MSClass"] = df.MSSubClass.str.split("_", n=1).str[0]
5. 提取分组特征
# Group statistic feature: each house gets its neighborhood's median living area.
# Other handy aggregations: max, min, mean, median, var, std, count.
X_5 = pd.DataFrame()
median_area_by_nbhd = df.groupby("Neighborhood")["GrLivArea"].transform("median")
X_5["MedNhbdArea"] = median_area_by_nbhd
6. k-means聚类特征
from sklearn.cluster import KMeans
# TODO: fill in the feature names to cluster on — empty list yields an
# empty X_scaled and KMeans will fail.
features = []
# Standardize (z-score) so all features weigh equally in the distance metric.
X_scaled = X.loc[:, features]
X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
# Fit ONCE and derive both features from the same model, so the cluster
# labels and the centroid distances are consistent with each other.
# n_init is pinned explicitly (sklearn >= 1.2 changed its default).
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
# Cluster-label feature.
X["Cluster"] = kmeans.fit_predict(X_scaled)
# Distance-to-each-centroid features from the already-fitted model.
X_cd = kmeans.transform(X_scaled)
# Keep X's index so the later X.join(X_cd) aligns row-for-row even when
# X does not have a default RangeIndex.
X_cd = pd.DataFrame(
    X_cd,
    columns=[f"Centroid_{i}" for i in range(X_cd.shape[1])],
    index=X.index,
)
7. 主成分分析(降维特征)
#PCA
from sklearn.decomposition import PCA
def apply_pca(X, standardize=True):
    """Run PCA on X and return (fitted_pca, components_frame, loadings_frame).

    Parameters
    ----------
    X : DataFrame of numeric features.
    standardize : bool, default True
        If True, z-score each column before fitting.
    """
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    pca = PCA()
    transformed = pca.fit_transform(X)
    # One name per retained component (PCA() keeps min(n_samples, n_features)).
    names = [f"PC{k + 1}" for k in range(transformed.shape[1])]
    X_pca = pd.DataFrame(transformed, columns=names)
    # Loadings: rows are original features, columns are principal components.
    loadings = pd.DataFrame(pca.components_.T, columns=names, index=X.columns)
    return pca, X_pca, loadings
8. 合并
# Accumulate all engineered features onto one frame. The original code
# assigned the first join to X_new and then joined X_cd/X_pca onto the
# plain X, silently discarding X_1..X_5 — chain from X_new instead.
X_new = X.join([X_1, X_2, X_3, X_4, X_5])
X = X_new.join(X_cd)
X = X.join(X_pca)
```