Kaggle Getting Started: Solving Titanic with a Random Forest

# kaggle Titanic
# Import the required libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection

# Start the data analysis
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

# Keep references to both frames so the loops below can modify them in place
data_all = [data_train, data_test]

# Quick check
# print(data_all)

# Lowercase the column names of both datasets
for data in data_all:
    data.columns = data.columns.str.lower()

# Quick check
# print(data_all)

# Data cleaning
# print(data_all[0].isnull().sum())  # age has 177 missing values, cabin 687, embarked 2
# print(data_all[1].isnull().sum())

# Fill the missing values, or drop the column where filling makes no sense
for data in data_all:
    data['age'] = data['age'].fillna(data['age'].median())
    data['fare'] = data['fare'].fillna(data['fare'].median())
    # mode() returns values sorted by frequency, so [0] is the most common port
    data['embarked'] = data['embarked'].fillna(data['embarked'].mode()[0])

drop_columns = ['cabin', 'passengerid', 'ticket']
for data in data_all:
    data.drop(drop_columns, axis=1, inplace=True)

# Quick check
# print(data_train.isnull().sum())

# Feature engineering
for data in data_all:
    data['family_size'] = data['sibsp'] + data['parch'] + 1
    # 1 if the passenger travelled alone, 0 otherwise
    data['single'] = (data['family_size'] == 1).astype(int)
    # The title sits between ', ' and '.', e.g. "Braund, Mr. Owen Harris" -> "Mr"
    data['title'] = data['name'].apply(lambda x: x.split(', ')[1].split('.')[0])
    data['fare_bin'] = pd.cut(data['fare'], 4)  # 4 equal-width fare bins
    data['age_bin'] = pd.cut(data['age'], 5)  # 5 equal-width age bins
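
# pd.cut returns categorical interval columns; a quick inspection sketch
# (column names as created above) if you want to see the bin edges:
# print(data_train['fare_bin'].cat.categories)
# print(data_train['age_bin'].cat.categories)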

# Quick check
# data_train.to_csv('data/my_train.csv')
# data_test.to_csv('data/my_test.csv')

# Group titles that appear fewer than 10 times into 'other'
# (compute the counts once instead of once per row)
title_counts = data_train['title'].value_counts()
data_train['title'] = data_train['title'].apply(lambda x: x if title_counts[x] >= 10 else 'other')
title_counts = data_test['title'].value_counts()
data_test['title'] = data_test['title'].apply(lambda x: x if title_counts[x] >= 10 else 'other')
# Quick check
# print(data_train['title'].value_counts())

# Encode the categorical columns as integers.
# Note: fit_transform runs separately on train and test, so the codes only line
# up across the two sets when both contain exactly the same categories (true for
# sex and embarked; for title and the bins this is a shortcut worth knowing about).
label = LabelEncoder()
for data in data_all:
    data['sex_code'] = label.fit_transform(data['sex'])
    data['embarked_code'] = label.fit_transform(data['embarked'])
    data['title_code'] = label.fit_transform(data['title'])
    data['age_bin_code'] = label.fit_transform(data['age_bin'])
    data['fare_bin_code'] = label.fit_transform(data['fare_bin'])
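
# A minimal sketch of a more robust alternative, assuming you want identical
# codes in both sets: fit the encoder once on the union of train and test,
# then only transform each set (shown for the string columns; the bin columns
# would first need shared bin edges):
# for col in ['sex', 'embarked', 'title']:
#     label.fit(pd.concat([data_train[col], data_test[col]]))
#     for data in data_all:
#         data[col + '_code'] = label.transform(data[col])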


target = ['survived']
features = ['pclass', 'family_size', 'single', 'sex_code', 'embarked_code',
            'title_code', 'age_bin_code', 'fare_bin_code']

features_all = target + features
# Note: all selected features are already integer-coded, so get_dummies leaves
# them unchanged here; by default it only one-hot encodes object/category columns
data_dummy = pd.get_dummies(data_train[features])
# data_dummy.to_csv('data/dummy.csv')
# print(data_dummy)
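
# If actual one-hot columns are wanted, a sketch (treating the integer codes
# as categories) is to name the columns explicitly, since get_dummies encodes
# whatever is passed via columns= regardless of dtype:
# data_dummy = pd.get_dummies(data_train[features],
#                             columns=['sex_code', 'embarked_code', 'title_code'])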

# Split into a training set and a hold-out test set
# (random_state fixes the split; test_size defaults to 0.25)

x_train, x_test, y_train, y_test = model_selection.train_test_split(data_dummy[features],
                                                                    data_train[target],
                                                                    random_state=0)
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features='sqrt',  # 'auto' was removed in newer scikit-learn; 'sqrt' is the classifier default
                            random_state=1,  # fixed seed for reproducibility
                            n_jobs=-1)  # -1 = train with all CPU cores

param_grid = {
    'criterion': ['gini', 'entropy'],  # Gini impurity vs. information gain; the search picks the better one
    'min_samples_leaf': [1, 5, 10],  # minimum number of samples required at a leaf node
    'min_samples_split': [2, 4, 10, 16],  # minimum number of samples required to split an internal node
    'n_estimators': [50, 100, 400, 700, 1000]  # number of trees in the forest
}  # the grid search tries every combination and keeps the best-scoring one
# Create a grid-search object
gs = GridSearchCV(estimator=rf,  # the random forest defined above
                  param_grid=param_grid,  # the candidate parameters
                  scoring='accuracy',  # selection metric: accuracy
                  cv=3,  # 3-fold cross-validation
                  n_jobs=-1  # use all CPU cores
                  )
gs = gs.fit(x_train, np.ravel(y_train))  # ravel flattens the (n, 1) target frame to shape (n,)

print(gs.best_score_)
print(gs.best_params_)
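
# GridSearchCV refits the best combination on the full training data by
# default, so the tuned model is directly available; a sketch of scoring it
# on the hold-out split without retraining:
# print(gs.best_estimator_.score(x_test, np.ravel(y_test)))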

# Retrain a forest with the best parameters found above
rf2 = RandomForestClassifier(criterion='entropy',
                             min_samples_leaf=5,
                             min_samples_split=16,
                             n_estimators=50,
                             n_jobs=-1,
                             random_state=1)
rf2.fit(x_train, np.ravel(y_train))

# Predict on the hold-out split and save the predictions
pred = rf2.predict(x_test)
pred_df = pd.DataFrame(pred, columns=['survived'])

pred_df.to_csv('data/pred_df.csv')
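
# pred_df holds predictions for the local hold-out split, not for Kaggle's
# test set. A minimal sketch of an actual submission file, assuming the same
# engineered feature columns exist on data_test and re-reading PassengerId
# from the raw file because that column was dropped earlier:
# submission = pd.DataFrame({
#     'PassengerId': pd.read_csv('data/test.csv')['PassengerId'],
#     'Survived': rf2.predict(data_test[features])
# })
# submission.to_csv('data/submission.csv', index=False)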
