[Hands-On Project] Tmall Repeat-Purchase Prediction: Feature Engineering
Importing the tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import gc
from collections import Counter
import copy
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
Reading the data
Read the training set, the test set, the user info table, and the user log data.
#读取数据集
#test_data = pd.read_csv('./data_format1/test_format1.csv')
#train_data = pd.read_csv('./data_format1/train_format1.csv')
#user_info = pd.read_csv('./data_format1/user_info_format1.csv')
#user_log = pd.read_csv('./data_format1/user_log_format1.csv')
Data reading function
def read_csv(file_name, num_rows):
    return pd.read_csv(file_name, nrows=num_rows)
Memory reduction helper
# reduce memory
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
Read the data with memory compression
num_rows = None            # None reads the complete files
num_rows = 200 * 10000     # cap at 2,000,000 rows for this walkthrough
# num_rows = 1000          # tiny sample for quick code tests
train_file = './data_format1/train_format1.csv'
test_file = './data_format1/test_format1.csv'
user_info_file = './data_format1/user_info_format1.csv'
user_log_file = './data_format1/user_log_format1.csv'
train_data = reduce_mem_usage(read_csv(train_file, num_rows))
test_data = reduce_mem_usage(read_csv(test_file, num_rows))
user_info = reduce_mem_usage(read_csv(user_info_file, num_rows))
user_log = reduce_mem_usage(read_csv(user_log_file, num_rows))
Compression results
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage after optimization is: 32.43 MB
Decreased by 69.6%
Inspect the compressed data
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 260864 non-null int32
1 merchant_id 260864 non-null int16
2 label 260864 non-null int8
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 261477 non-null int32
1 merchant_id 261477 non-null int16
2 prob 0 non-null float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB
user_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 424170 non-null int32
1 age_range 421953 non-null float16
2 gender 417734 non-null float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB
user_log.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
# Column Dtype
--- ------ -----
0 user_id int32
1 item_id int32
2 cat_id int16
3 seller_id int16
4 brand_id float16
5 time_stamp int16
6 action_type int8
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 32.4 MB
Data processing
Merge the user info
del test_data['prob']
all_data = train_data.append(test_data)   # on pandas >= 2.0 use pd.concat([train_data, test_data]) instead
all_data = all_data.merge(user_info, on=['user_id'], how='left')
del train_data, test_data, user_info
gc.collect()
all_data.head()

|   | user_id | merchant_id | label | age_range | gender |
|---|---------|-------------|-------|-----------|--------|
| 0 | 34176   | 3906        | 0.0   | 6.0       |        |
| 1 | 34176   | 121         | 0.0   | 6.0       |        |
| 2 | 34176   | 4356        | 1.0   | 6.0       |        |
| 3 | 34176   | 2217        | 0.0   | 6.0       |        |
| 4 | 230784  | 4818        | 0.0   | 0.0       |        |
Sort the user behaviour log by user and timestamp
user_log = user_log.sort_values(['user_id','time_stamp'])
user_log.head()

|       | user_id | item_id | cat_id | seller_id | brand_id | time_stamp | action_type |
|-------|---------|---------|--------|-----------|----------|------------|-------------|
| 61975 | 16      | 980982  | 437    | 650       | 4276.0   | 914        |             |
| 61976 | 16      | 980982  | 437    | 650       | 4276.0   | 914        |             |
| 61977 | 16      | 980982  | 437    | 650       | 4276.0   | 914        |             |
| 61978 | 16      | 962763  | 19     | 650       | 4276.0   | 914        |             |
| 61979 | 16      | 391126  | 437    | 650       | 4276.0   | 914        |             |
For each user, concatenate all of the item_id, cat_id, seller_id, brand_id, time_stamp and action_type values into space-separated "path" strings (the aggregation and merge that actually build all_data_path are not shown in the post; a sketch follows the two dictionaries below).
list_join_func = lambda x: " ".join([str(i) for i in x])

agg_dict = {
    'item_id'     : list_join_func,
    'cat_id'      : list_join_func,
    'seller_id'   : list_join_func,
    'brand_id'    : list_join_func,
    'time_stamp'  : list_join_func,
    'action_type' : list_join_func
}
rename_dict = {
    'item_id'     : 'item_path',
    'cat_id'      : 'cat_path',
    'seller_id'   : 'seller_path',
    'brand_id'    : 'brand_path',
    'time_stamp'  : 'time_stamp_path',
    'action_type' : 'action_type_path'
}
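The later cells operate on all_data_path, but the post never shows the groupby that creates it. A minimal sketch consistent with agg_dict and rename_dict above (the original code may differ slightly):

# Aggregate every user's log rows into space-separated "path" strings,
# then attach them to all_data; this is the all_data_path used below.
user_log_path = user_log.groupby('user_id').agg(agg_dict).reset_index().rename(columns=rename_dict)
all_data_path = all_data.merge(user_log_path, on='user_id', how='left')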
Delete the raw log and reclaim memory
del user_log
gc.collect()
Define the statistics helper functions
Count the number of entries in a path
def cnt_(x):
    try:
        # split() cuts the string on the given separator; a path with n separators yields n + 1 tokens
        return len(x.split(' '))
    except:
        return -1
Count the number of unique entries
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1
Maximum value in a path
def max_(x):
    try:
        return np.max([int(i) for i in x.split(' ')])
    except:
        return -1
Minimum value in a path
def min_(x):
    try:
        return np.min([int(i) for i in x.split(' ')])
    except:
        return -1
Standard deviation of a path
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1
The N-th most common value in a path
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1
The count of the N-th most common value in a path
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1
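The extraction cells below also call single-column wrappers (user_cnt, user_nunique, and so on) that the post does not show. A plausible sketch, simply applying the helpers above to one path column at a time:

def user_cnt(df_data, single_col, name):
    # number of entries in the path column
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data, single_col, name):
    # number of distinct entries in the path column
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data

def user_max(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_min(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(min_)
    return df_data

def user_std(df_data, single_col, name):
    df_data[name] = df_data[single_col].apply(std_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n(x, n))
    return df_data

def user_most_n_cnt(df_data, single_col, name, n=1):
    df_data[name] = df_data[single_col].apply(lambda x: most_n_cnt(x, n))
    return df_data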
Extract the basic statistical features of the shops
all_data_test = all_data_path.head(2000)
#all_data_test = all_data_path
# Statistics of the user's click, view, add-to-cart and purchase behaviour
# Total number of actions
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')
# Number of distinct shops
all_data_test = user_nunique(all_data_test, 'seller_path', 'seller_nunique')
# Number of distinct categories
all_data_test = user_nunique(all_data_test, 'cat_path', 'cat_nunique')
# Number of distinct brands
all_data_test = user_nunique(all_data_test, 'brand_path', 'brand_nunique')
# Number of distinct items
all_data_test = user_nunique(all_data_test, 'item_path', 'item_nunique')
# Number of active days
all_data_test = user_nunique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')
# Number of distinct action types
all_data_test = user_nunique(all_data_test, 'action_type_path', 'action_type_nunique')
# ....
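The column list printed later also contains time-span and "most common" features (time_stamp_max, seller_most_1, and so on) whose extraction is not shown. Assuming the wrapper sketch above, they could be produced like this (names follow the printed columns; the exact original code may differ):

# Time-span features
all_data_test = user_max(all_data_test, 'time_stamp_path', 'time_stamp_max')
all_data_test = user_min(all_data_test, 'time_stamp_path', 'time_stamp_min')
all_data_test = user_std(all_data_test, 'time_stamp_path', 'time_stamp_std')
all_data_test['time_stamp_range'] = all_data_test['time_stamp_max'] - all_data_test['time_stamp_min']
# Most frequent shop / category / brand / action, and their counts
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'cat_path', 'cat_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', n=1)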
Count the user's clicks, add-to-cart actions, purchases and favourites separately
Helper functions for per-action-type statistics
def col_cnt_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
            path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        return len(data_out)
    except:
        return -1
def col_nuique_(df_data, columns_list, action_type):
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type != None:
            col_list += ['action_type_path']
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
            path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            data_out.append(data_txt)
        return len(set(data_out))
    except:
        return -1
def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nuique_(x, columns_list, action_type), axis=1)
    return df_data
Count how many times shops were clicked, added to cart, purchased and favourited by the user
# Number of clicks
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')
# Number of add-to-cart actions
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')
# Number of purchases
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')
# Number of favourites
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')
# Number of distinct shops clicked
all_data_test = user_col_nunique(all_data_test, ['seller_path'], '0', 'seller_nunique_0')
# ....
Combined features
Extract business features from column combinations
# Number of clicks on shop-item combinations
all_data_test = user_col_cnt(all_data_test, ['seller_path', 'item_path'], '0', 'user_cnt_0')
# Number of distinct shop-item combinations clicked
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nunique_0')
Inspect the extracted features
list(all_data_test.columns)
['user_id',
 'merchant_id',
 'label',
 'age_range',
 'gender',
 'item_path',
 'cat_path',
 'seller_path',
 'brand_path',
 'time_stamp_path',
 'action_type_path',
 'user_cnt',
 'seller_nunique',
 'cat_nunique',
 'brand_nunique',
 'item_nunique',
 'time_stamp_nunique',
 'action_type_nunique',
 'time_stamp_max',
 'time_stamp_min',
 'time_stamp_std',
 'time_stamp_range',
 'seller_most_1',
 'cat_most_1',
 'brand_most_1',
 'action_type_1',
 'seller_most_1_cnt',
 'cat_most_1_cnt',
 'brand_most_1_cnt',
 'action_type_1_cnt',
 'user_cnt_0',
 'user_cnt_1',
 'user_cnt_2',
 'user_cnt_3',
 'seller_nunique_0']
Extracting features with CountVectorizer and TF-IDF
CountVectorizer and TfidfVectorizer play different roles here.
CountVectorizer turns each document into a matrix of term counts, while TfidfVectorizer differs in what it returns: CountVectorizer gives raw term frequencies, whereas TfidfVectorizer returns TF-IDF weights that down-weight terms appearing in many documents. A toy comparison follows.
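A small, self-contained comparison (not from the original notebook) on two made-up seller paths:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ['650 650 2934', '650 1022']   # two fake seller paths
cnt = CountVectorizer().fit(docs)
print(sorted(cnt.vocabulary_, key=cnt.vocabulary_.get))   # vocabulary: ['1022', '2934', '650']
print(cnt.transform(docs).toarray())                      # raw counts: [[0 1 2], [1 0 1]]
print(TfidfVectorizer().fit_transform(docs).toarray().round(2))
# '650' occurs in both documents, so TF-IDF down-weights it relative to its raw count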
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse

# cntVec = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)
tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)

# columns_list = ['seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    all_data_test[col] = all_data_test[col].astype(str)
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))
Rename the TF-IDF features and merge them with the original features
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf],axis=1)
Embedding features
The embedded method lets the algorithm itself decide which features to use: feature selection and model training happen at the same time. We first train some machine-learning model and obtain a weight coefficient for each feature, then select features from the largest weight down; these weights usually reflect how much each feature contributes to the model. For example, the feature_importances_ attribute of decision trees and tree ensembles lists each feature's contribution to building the trees, and we can use that contribution to find the features that matter most for the model. Compared with filter methods, the embedded method is therefore tied directly to the usefulness of the model itself and tends to improve it more; and because it measures contribution to the model, irrelevant features (those a correlation filter would remove) and non-discriminative features (those a variance filter would remove) are dropped simply because they contribute nothing. A small illustration is given below, before the Word2Vec code.
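As a small, self-contained illustration of the embedded method (not part of this pipeline; the data is synthetic), scikit-learn's SelectFromModel keeps only the features whose importance clears a threshold:

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_classification

# Synthetic data standing in for an engineered feature matrix
X, y = make_classification(n_samples=500, n_features=20, n_informative=5, random_state=0)

# Fit a forest, rank features by feature_importances_, keep those above the median importance
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0), threshold='median')
selector.fit(X, y)
X_selected = selector.transform(X)
print(X.shape, '->', X_selected.shape)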
import gensim

# Train a Word2Vec model over the seller paths.
# Note: `size` and `model.wv.vocab` below are the gensim 3.x API;
# gensim 4.x renamed `size` to `vector_size` and replaced `wv.vocab` with `wv.key_to_index`.
model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: x.split(' ')), size=100, window=5, min_count=5, workers=4)
# model.save("product2vec.model")
# model = gensim.models.Word2Vec.load("product2vec.model")
def mean_w2v_(x, model, size=100):
    # Average the Word2Vec vectors of all in-vocabulary tokens in the path;
    # if none are found (or anything fails), fall back to a zero vector.
    try:
        i = 0
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                if i == 1:
                    vec = np.zeros(size)
                vec += model.wv[word]
        return vec / i
    except:
        return np.zeros(size)
def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)
df_embeeding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embeeding.columns = ['embeeding_' + str(i) for i in df_embeeding.columns]
Merge the embedding features with the original features
all_data_test = pd.concat([all_data_test, df_embeeding],axis=1)
Stacking features
Stacking regression features
"""
-- Regression
-- stacking regression features
"""
def stacking_reg(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    # kf.split turns train_x (optionally stratified by label_split) into per-fold index arrays
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf", "ada", "gb", "et", "lr"]:
            # sklearn-style regressors: random forest, AdaBoost, GBDT, extra trees, linear regression
            clf.fit(tr_x, tr_y)
            pre = clf.predict(te_x).reshape(-1, 1)
            train[test_index] = pre
            test_pre[i, :] = clf.predict(test_x).reshape(-1, 1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:
            # XGBoost
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["lgb"]:
            # LightGBM
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression_l2',
                'metric': 'mse',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'nthread': 12,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(te_x, num_iteration=model.best_iteration).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)
def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf_reg"

def ada_reg(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_reg(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada_reg"

def gb_reg(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_reg(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb_reg"

def et_reg(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et_reg"

def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    lr_reg = LinearRegression(n_jobs=-1)
    lr_train, lr_test = stacking_reg(lr_reg, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr_reg"

def xgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_reg(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb_reg"

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test, "lgb_reg"
Stacking classification features
"""
-- Classification
-- stacking classification features
"""
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf", "ada", "gb", "et", "lr", "knn", "gnb"]:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            train[test_index] = pre[:, 0].reshape(-1, 1)
            test_pre[i, :] = clf.predict_proba(test_x)[:, 0].reshape(-1, 1)
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                # 'boosting_type': 'dart',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                "num_class": 2,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)
def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf"

def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostClassifier(n_estimators=50, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada"

def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb"

def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_clf(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et"

def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb"

def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "lgb"

def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb = GaussianNB()
    gnb_train, gnb_test = stacking_clf(gnb, x_train, y_train, x_valid, "gnb", kf, label_split=label_split)
    return gnb_train, gnb_test, "gnb"

def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    logisticregression = LogisticRegression(n_jobs=-1, random_state=2017, C=0.1, max_iter=200)
    lr_train, lr_test = stacking_clf(logisticregression, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr"

def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    kneighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(kneighbors, x_train, y_train, x_valid, "knn", kf, label_split=label_split)
    return knn_train, knn_test, "knn"
Build the training and validation data (in preparation for the stacking features)
features_columns = [c for c in all_data_test.columns if c not in ['label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
x_train = all_data_test[~all_data_test['label'].isna()][features_columns].values
y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values
Handle inf and NaN values
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data
x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = x_train  # in this walkthrough the "validation" matrix is simply a copy of the training matrix
Import the data-splitting helpers and use 5 folds for the stacking features
Note: I did not understand the five folds at first; after looking it up, the explanation is as follows.
k-fold cross-validation averages the results of training on k different splits, which reduces variance, so the model's performance is less sensitive to how the data is partitioned.
- Step 1: randomly split the original data into k parts, without replacement.
- Step 2: pick one part as the test set and use the remaining k-1 parts as the training set.
- Step 3: repeat step 2 k times so that every part serves as the test set exactly once; train a model on each training set, evaluate it on the corresponding test set, and record the metric.
- Step 4: average the k test results as the estimate of model accuracy, and report it as the model's performance under k-fold cross-validation. (A tiny demonstration of the index split follows the code below.)
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)
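A quick sanity check (not in the original post): kf.split just yields train/test index arrays for each of the five folds.

import numpy as np
from sklearn.model_selection import KFold

demo_X = np.arange(10).reshape(10, 1)   # ten dummy samples
for fold, (tr_idx, te_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=0).split(demo_X)):
    print(fold, 'train size:', len(tr_idx), 'test size:', len(te_idx))
# each fold trains on 8 samples and holds out 2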
Build the stacking features with the LightGBM and XGBoost classifiers
clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']
Train the models and collect the stacking features
clf_list = clf_list
column_list = []
train_data_list = []
test_data_list = []
for clf in clf_list:
    train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)
Merge the original features with the stacking features
train = pd.DataFrame(np.concatenate([x_train, train_stacking], axis=1))
test = np.concatenate([x_valid, test_stacking], axis=1)
Rename the feature columns
df_train_all = pd.DataFrame(train)
df_train_all.columns = features_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = features_columns + clf_list_col
Attach the label column to the training data
df_train_all['label'] = all_data_test['label']
Save the training and test data
df_train_all.to_csv('train_all.csv',header=True,index=False)
df_test_all.to_csv('test_all.csv',header=True,index=False)
This post is from cnblogs (author: Lugendary). Please cite the original link when reposting: https://www.cnblogs.com/lugendary/p/16041318.html