Kaggle starter: solving the Titanic competition with a random forest
# Kaggle Titanic
# Import the required libraries
import pandas as pd
import numpy as np
import sys
import sklearn
import random
import time
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Start the data analysis
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')
data_all = [data_train, data_test]
# Quick check
# print(data_all)

# Convert all column names to lowercase
for data in data_all:
    data.columns = data.columns.str.lower()
# Quick check
# print(data_all)

# Data cleaning
# print(data_all[0].isnull().sum())  # age has 177 missing values, cabin 687, embarked 2
# print(data_all[1].isnull().sum())

# Fill in missing values, or drop the column entirely
for data in data_all:
    # assign back instead of using inplace=True so this also works with pandas copy-on-write
    data['age'] = data['age'].fillna(data['age'].median())
    data['fare'] = data['fare'].fillna(data['fare'].median())
    data['embarked'] = data['embarked'].fillna(data['embarked'].mode()[0])  # mode() returns values ordered by frequency, so [0] is the most common

drop_columns = ['cabin', 'passengerid', 'ticket']
for data in data_all:
    data.drop(drop_columns, axis=1, inplace=True)
# Quick check
# print(data_train.isnull().sum())

# Feature engineering
for data in data_all:
    data['family_size'] = data['sibsp'] + data['parch'] + 1
    data['single'] = 1
    data.loc[data['family_size'] > 1, 'single'] = 0  # use .loc on the frame to avoid chained assignment
    data['title'] = data['name'].apply(lambda x: x.split(', ')[1]).apply(lambda x: x.split('.')[0])
    data['fare_bin'] = pd.cut(data['fare'], 4)
    data['age_bin'] = pd.cut(data['age'], 5)
# Quick check
# data_train.to_csv('data/my_train.csv')
# data_test.to_csv('data/my_test.csv')

# Group rare titles (fewer than 10 occurrences) into 'other'
data_train['title'] = data_train['title'].apply(lambda x: 'other' if data_train['title'].value_counts()[x] < 10 else x)
data_test['title'] = data_test['title'].apply(lambda x: 'other' if data_test['title'].value_counts()[x] < 10 else x)
# Quick check
# print(data_train['title'].value_counts())

# Encode the categorical features as integers
label = LabelEncoder()
for data in data_all:
    data['sex_code'] = label.fit_transform(data['sex'])
    data['embarked_code'] = label.fit_transform(data['embarked'])
    data['title_code'] = label.fit_transform(data['title'])
    data['age_bin_code'] = label.fit_transform(data['age_bin'])
    data['fare_bin_code'] = label.fit_transform(data['fare_bin'])

target = ['survived']
features = ['pclass', 'family_size', 'single', 'sex_code', 'embarked_code',
            'title_code', 'age_bin_code', 'fare_bin_code']
features_all = target + features
data_dummy = pd.get_dummies(data_train[features])
# data_dummy.to_csv('data/dummy.csv')
# print(data_dummy)

# Split off a training set and a hold-out test set
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_dummy[features], data_train[target], random_state=0)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features='sqrt',  # 'auto' was removed in newer scikit-learn; 'sqrt' is the equivalent for classifiers
                            random_state=1,       # fixed seed so results are reproducible
                            n_jobs=-1)            # -1: use all CPU cores
param_grid = {
    'criterion': ['gini', 'entropy'],          # Gini impurity vs. information entropy; the search compares both and keeps the better one
    'min_samples_leaf': [1, 5, 10],            # minimum number of samples kept in a leaf
    'min_samples_split': [2, 4, 10, 16],       # minimum number of samples required to split a node
    'n_estimators': [50, 100, 400, 700, 1000]  # number of trees in the forest
}
# Grid search: try every combination of the parameters above and keep the best one
gs = GridSearchCV(estimator=rf,           # the random forest to tune
                  param_grid=param_grid,  # the parameter grid
                  scoring='accuracy',     # evaluation metric: accuracy
                  cv=3,                   # 3-fold cross-validation
                  n_jobs=-1)              # -1: use all CPU cores
gs = gs.fit(x_train, np.ravel(y_train))
print(gs.best_score_)
print(gs.best_params_)

# Retrain with the best parameters reported by the grid search
rf2 = RandomForestClassifier(criterion='entropy', min_samples_leaf=5, min_samples_split=16,
                             n_estimators=50, n_jobs=-1, random_state=1)
rf2.fit(x_train, np.ravel(y_train))
pred = rf2.predict(x_test)
pred_df = pd.DataFrame(pred, columns=['survived'])
pred_df.to_csv('data/pred_df.csv')
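
The script above only writes predictions for the hold-out split; the Kaggle leaderboard expects a file with PassengerId and Survived columns for the official test set. Below is a minimal sketch of how that file could be produced from the data_test frame prepared above. It assumes data/test.csv still contains the original PassengerId column (it was dropped from data_test earlier) and keeps the original script's approach of encoding train and test with separately fitted label encoders; the file name data/submission.csv is just an example.

# Hypothetical submission step: predict on the engineered data_test and pair the
# predictions with PassengerId re-read from the original CSV (an assumption about
# the file layout, not part of the original script)
submission_ids = pd.read_csv('data/test.csv')['PassengerId']
test_pred = rf2.predict(data_test[features])
submission = pd.DataFrame({'PassengerId': submission_ids, 'Survived': test_pred})
submission.to_csv('data/submission.csv', index=False)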