数据挖掘竞赛常用代码段
常用库
import gc
import os
import csv
import time
import math
import datetime
import collections
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook, trange
from sklearn import preprocessing
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
降低内存
def reduce_mem_usage(df):
    """Iterate through all columns of a dataframe and downcast each numeric
    column to the smallest dtype that can hold its value range, to reduce
    memory usage.

    Object columns are converted to ``category``. Float columns may be
    downcast to float16, which loses precision beyond ~3 decimal digits.

    Parameters
    ----------
    df : pd.DataFrame
        Modified in place (columns are reassigned) and also returned.

    Returns
    -------
    pd.DataFrame
        The same dataframe with downcast dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # BUG FIX: use inclusive bounds (>=, <=). The original strict
                # comparisons pushed values exactly at a dtype's min/max into
                # the next-larger dtype (e.g. 127 ended up int16, not int8).
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
常用统计特征
一阶
# First-order aggregation: per-user statistics, joined onto the feature table.
tmp_df = use_Mb_info_n.groupby("user_id").agg({
    "x1": ["sum", "max", "min", "mean"],
    "x2": ["count", "nunique"],
    "x3": ["nunique"],
    "x4": ["sum"],
})
# Flatten the (column, statistic) MultiIndex into flat "col_stat" names.
tmp_df.columns = ["_".join(map(str, pair)) for pair in tmp_df.columns]
tmp_df = tmp_df.reset_index()
feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')
二阶
# Second-order aggregation: per (user_id, date) statistics, pivoted so each
# date becomes its own set of columns, then joined onto the feature table.
tmp_df = train_data.groupby(["user_id", "date"]).agg({
    "x1": ["sum", "mean", "max", "skew", pd.DataFrame.kurt],
    "x2": ["nunique"],
    "x3": ["sum"],
    "x4": ["sum"]
})
tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
# Pivot the date level out of the index: one column per (stat, date) pair.
tmp_df = tmp_df.unstack(level=-1)
tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
tmp_df = tmp_df.reset_index()
# BUG FIX: after reset_index() the key column of tmp_df is 'user_id' (the
# groupby key), not 'uid' -- merging on 'uid' raises a KeyError.
feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')
画图
协方差
def correlation_heatmap(df):
    """Render the pairwise correlation matrix of *df* as a large annotated
    heatmap and save it to ./cor.png."""
    _, ax = plt.subplots(figsize=(20, 20))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    heat = sns.heatmap(
        df.corr(),
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1,
        vmax=1.0,
        linecolor='white',
        annot_kws={'fontsize': 30},
    )
    heat.set_xticklabels(heat.get_xmajorticklabels(), fontsize=30)
    heat.set_yticklabels(heat.get_ymajorticklabels(), fontsize=30)
    # heat.set_xlabel("X Label", fontsize=30)
    # heat.set_ylabel("Y Label", fontsize=30)
    # plt.title('Pearson Correlation of Features', y=1.05, size=40)
    plt.savefig("./cor.png")


correlation_heatmap(df)
正常显示中文
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese (CJK) labels render instead of boxes
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly while a CJK font is active
大小
plt.rcParams['figure.figsize'] = (10,5)  # default figure size in inches (width, height)
plt.rcParams['figure.dpi'] = 200  # default figure resolution
count条状图
sns.countplot(y="店铺名称", data=df, color="c")
训练
lgb
n_splits = 5 # number of cross-validation folds
seed = 19950115 # random seed for reproducibility
gbm=None
# LightGBM hyper-parameters
lgb_params = {
"learning_rate": 0.005,
"lambda_l2": 0.15,
"max_depth": 9,
"objective": "binary",
"verbose": -1,
# 'feature_fraction': 0.9,
# "min_split_gain": 0.1,
"boosting_type": "gbdt",
"subsample": 0.75,
"colsample_bytree": 0.75,
# "colsample_bylevel": 0.9,
"scale_pos_weight": 16,
'metric': ['auc'], # evaluation metric
}
# ---- LightGBM stratified K-fold training ------------------------------------
# Features: everything except the label and id columns.
df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
label = data['label']
predictions = 0
feature_importance_df = pd.DataFrame()
skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(data.iloc[trn_idx][df_train_columns], label=label.iloc[trn_idx])
    val_data = lgb.Dataset(data.iloc[val_idx][df_train_columns], label=label.iloc[val_idx])
    # Train with early stopping monitored on the validation fold.
    gbm = lgb.train(lgb_params,
                    trn_data,
                    # init_model=gbm,  # uncomment to warm-start from the previous fold
                    num_boost_round=150000,
                    valid_sets=[trn_data, val_data],
                    early_stopping_rounds=200,
                    verbose_eval=200)
    # joblib.dump(gbm, "model/lgb_{}.m".format(fold_))   # save model
    # gbm = joblib.load("model/lgb_{}.m".format(fold_))  # load model
    # gbm.save_model(MODEL_PATH+'/lgb_more_fea.model', num_iteration=gbm.best_iteration)
    y_pred = gbm.predict(data.iloc[val_idx][df_train_columns], num_iteration=gbm.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = gbm.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # BUG FIX: for the binary objective predict() returns a 1-D array, so
    # y_pred.T[0] silently kept only the FIRST prediction of the fold.
    # Accumulate the whole fold's predictions instead.
    # NOTE(review): each fold predicts a different validation slice -- if an
    # out-of-fold vector is wanted, assign into predictions[val_idx] instead.
    predictions += y_pred / skf.n_splits
画特征重要性
# Bar chart of feature importances averaged over folds (top 1000 features).
mean_importance = (feature_importance_df[["Feature", "importance"]]
                   .groupby("Feature")
                   .mean()
                   .sort_values(by="importance", ascending=False))
cols = mean_importance[:1000].index
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
plt.figure(figsize=(14, 26))
ranked = best_features.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="Feature", data=ranked)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
画树
# Render one tree of the trained booster on a very large canvas.
fig, ax = plt.subplots(figsize=(100, 100))
lgb.plot_tree(gbm, ax=ax)
xgb
# ---- XGBoost stratified K-fold training -------------------------------------
xgb1 = xgb.XGBClassifier(max_depth=9,
                         learning_rate=0.005,
                         n_estimators=10000,
                         colsample_bytree=0.75,
                         # BUG FIX: the parameter name is 'subsample'; the
                         # misspelled 'sub_sample' was silently ignored.
                         subsample=0.75,
                         reg_lambda=0.15,
                         n_jobs=4,
                         random_state=3,
                         scale_pos_weight=16)
df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
label = data['label']
n_splits = 5  # number of cross-validation folds
seed = 19950115  # random seed for reproducibility
skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
    print("fold {}".format(fold_))
    X_train = data.iloc[trn_idx][df_train_columns]
    y_train = label.iloc[trn_idx]
    X_valid = data.iloc[val_idx][df_train_columns]
    y_valid = label.iloc[val_idx]
    watchlist = [(X_train, y_train), (X_valid, y_valid)]
    # Fit with early stopping monitored on the validation fold.
    xbm = xgb1.fit(
        X=X_train,
        y=y_train,
        eval_set=watchlist,
        early_stopping_rounds=200,
        verbose=100,
        eval_metric='auc',
    )
画特征重要性
# Plot XGBoost feature importance for the fitted model.
fig, ax = plt.subplots(figsize=(14, 26))
xgb.plot_importance(xbm, ax=ax, height=0.3)
画树
# BUG FIX: 'clf' is never defined in this file (it appears only in a
# commented-out line); plot the fitted model 'xbm' instead.
# NOTE(review): requires a feature-map file 'xgb.fmap' on disk -- confirm it exists.
xgb.plot_tree(xbm, num_trees=0, fmap='xgb.fmap')
fig = plt.gcf()
fig.set_size_inches(150, 100)
plt.show()