kaggle初探之titanic

环境部署

环境部署需要安装python,这里已经配置好,略过

首先登录 kaggle,下载 titanic 数据

https://www.kaggle.com/c/titanic/data
点击Download ALL

查看数据

  • gender_submission.csv
  • test.csv
  • train.csv

开始建模

import pandas as pd
import os
from sklearn.feature_extraction import DictVectorizer
#导入随机森林
from sklearn.ensemble import RandomForestClassifier
#导入xgboost并初始化
from xgboost import  XGBClassifier
#使用网格搜寻的方法寻找更好的超参数
from sklearn.model_selection import GridSearchCV


# Switch to the project directory so the relative "Datasets/..." paths resolve.
os.chdir("/media/sdc/yueyao/Test/jupyter")
# Load the training set and the test set in one step.
train, test = pd.read_csv("Datasets/train.csv"), pd.read_csv("Datasets/test.csv")

# Select the features considered useful for prediction.
selected_features = ['Pclass','Sex','Age','Embarked','SibSp','Parch','Fare']
# Take explicit copies: the imputation below must modify these frames, not a
# slice of `train` / `test` (which triggers SettingWithCopyWarning and is a
# no-op under pandas copy-on-write).
X_train = train[selected_features].copy()
X_test = test[selected_features].copy()
y_train = train['Survived']

# Embarked has missing values; fill them with 'S', chosen by the author as the
# most frequent value.
# Assign the result back instead of using chained `inplace=True`, which does
# not reliably update the parent frame.
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')

# Fill missing Age (train and test) and Fare (test only) with the mean of the
# same column in the same frame.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].mean())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

# Vectorize the features: each row becomes a dict, and DictVectorizer turns
# string-valued features into one-hot columns while passing numeric ones through.
dict_vec = DictVectorizer(sparse = False)
# 'records' is the valid orient name; the abbreviation 'record' is rejected by
# pandas >= 2.0.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
# Only transform the test set: reusing the vocabulary fitted on the training
# data guarantees that test columns line up with the training columns.
X_test = dict_vec.transform(X_test.to_dict(orient='records'))


# Instantiate both classifiers with their default hyper-parameters.
rfc = RandomForestClassifier()
xgbc = XGBClassifier()

# Evaluate both models with 5-fold cross-validation on the training data.
from sklearn.model_selection import cross_val_score
# Keep and print the mean scores; as bare expressions they would be silently
# discarded when this runs as a script.
rfc_cv_score = cross_val_score(rfc,X_train,y_train,cv=5).mean()
xgbc_cv_score = cross_val_score(xgbc,X_train,y_train,cv=5).mean()
print("RandomForestClassifier 5-fold CV mean score:", rfc_cv_score)
print("XGBClassifier 5-fold CV mean score:", xgbc_cv_score)

# Train the random forest on the full training set, then predict the test set.
rfc_y_predict = rfc.fit(X_train, y_train).predict(X_test)

# Assemble the submission table: one row per test passenger with its prediction.
rfc_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': rfc_y_predict,
}
rfc_submission = pd.DataFrame(rfc_columns)
# Write the submission file without the index column.
rfc_submission.to_csv("Datasets/rfc_submission.csv", index=False)

# Train the XGBoost classifier on the full training set, then predict the test set.
xgbc_predict = xgbc.fit(X_train, y_train).predict(X_test)

# Assemble the submission table and write it without the index column.
xgbc_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': xgbc_predict,
}
xgbc_submission = pd.DataFrame(xgbc_columns)
xgbc_submission.to_csv("Datasets/xgbc_submission.csv", index=False)


# Hyper-parameter grid for the exhaustive search:
# 5 depths x 5 estimator counts x 5 learning rates.
params={
    'max_depth':range(2,7),
    'n_estimators':range(100,1100,200),
    'learning_rate':[0.05,0.1,0.25,0.5,1.0]
}
xgbc_best = XGBClassifier()
# Grid search with 5-fold cross-validation, using all available cores.
gs = GridSearchCV(xgbc_best,params,n_jobs=-1,cv=5,verbose=1)
gs.fit(X_train,y_train)

# Report the best cross-validated score and the parameters that produced it;
# as bare expressions these would be silently discarded when run as a script.
print("Best CV score:", gs.best_score_)
print("Best parameters:", gs.best_params_)


# Predict with the fitted grid search; with GridSearchCV's default refit=True
# this uses the estimator refit with the best parameters found.
xgbc_best_y_predict = gs.predict(X_test)

# Assemble the submission table and write it without the index column.
xgbc_best_columns = {
    'PassengerId': test['PassengerId'],
    'Survived': xgbc_best_y_predict,
}
xgbc_best_submission = pd.DataFrame(xgbc_best_columns)
xgbc_best_submission.to_csv("Datasets/xgbc_best_submission.csv", index=False)

结果的提交

  • 使用 API 进行提交:首先需要在 kaggle 账户页面下载 json 格式的 API 密钥文件(kaggle.json),并放到家目录下的 .kaggle/ 目录中
# Move the downloaded API key file into the .kaggle/ directory under the home directory.
mv kaggle.json /home/yueyao/.kaggle/

# Install the kaggle command-line client with pip (using the Tsinghua PyPI mirror).
python -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple kaggle

# Submit the XGBoost predictions to the titanic competition with a short message.
kaggle competitions submit -c titanic -f xgbc_submission.csv -m "xgbc_predict"

提交完成后,可以在竞赛页面的 My Submissions(我的提交)中查看得分

posted @ 2020-01-21 11:47  raisok  阅读(564)  评论(0)  编辑  收藏  举报