机器学习模板
根据心情补充,语言都是Python
标签编码(原文称为 hash):用 LabelEncoder 把所有文本类型的列转化成数字
from sklearn.preprocessing import LabelEncoder
# Label-encode every text (object-dtype) column of `train`, and the matching
# column of `test` when it exists, replacing the values in place with the
# integer codes produced by LabelEncoder.
for c in train.columns:
    if train[c].dtype != 'object':
        continue
    # A column may exist only in `train` (e.g. a text target); encode it
    # from the train values alone instead of failing with a KeyError.
    in_test = c in test.columns
    # Fit on the train and test values together so both frames share one
    # mapping and transform() never meets a value it has not seen.
    values = list(train[c].values)
    if in_test:
        values += list(test[c].values)
    lbl = LabelEncoder()
    lbl.fit(values)
    train[c] = lbl.transform(list(train[c].values))
    if in_test:
        test[c] = lbl.transform(list(test[c].values))
XGBoost 训练
# Train the XGBoost model, then predict on the test data.
xgb_params = {
    # The number of trees is controlled by num_boost_round below; the former
    # 'n_trees' entry is not an XGBoost parameter and has been dropped.
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
    'eval_metric': 'rmse',
    'base_score': y_mean,  # base prediction = mean(target)
    'verbosity': 0,  # replaces the deprecated 'silent': 1
}
# NOTE: the target column must be named 'y' in `train`; it is dropped from the
# feature matrix here and the labels are passed separately as y_train
# (presumably train['y'] -- confirm where y_train is defined).
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)
num_boost_rounds = 1250
# Train the model; verbosity=1 (warnings only) overrides the quiet default
# above, mirroring the original silent=0 override at training time.
model = xgb.train(dict(xgb_params, verbosity=1), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)
One-Hot 矩阵转换
# Fit one encoder on the categorical columns of X and X_test stacked together,
# so both transforms below share the same category set; categories unseen at
# fit time are ignored rather than raising (handle_unknown='ignore').
enc = OneHotEncoder(handle_unknown='ignore').fit(
    pd.concat([X[categorical], X_test[categorical]])
)
# Encode the train and test categorical columns with the shared encoder.
X_cat_sparse, X_test_cat_sparse = (
    enc.transform(frame[categorical]) for frame in (X, X_test)
)