Titanic

  Survivor prediction on the Titanic is the classic introductory exercise in data analysis. The dataset can be downloaded from Kaggle.

  

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

titanic = pd.read_csv('D:/train.csv')
# print(titanic.head())
# print(titanic.describe())

# describe() shows the Age column has missing values, so fill them with the median
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
# print(titanic.describe())

# print(titanic['Sex'].unique())
# Convert the string values to numbers
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Port of embarkation
# print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
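
# Note: the same conversions can be written more compactly with Series.map.
# Shown commented out (a sketch only) so it does not re-run on the already
# converted columns; the 0/1/2 codes match the .loc assignments above:
# titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})
# titanic['Embarked'] = titanic['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})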



# Predict with linear regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

alg = LinearRegression()
kf = KFold(n_splits=3)

# Collected out-of-fold predictions
predictions = []
# Train on each training fold, then predict on the held-out fold
for train, test in kf.split(titanic):
    train_predictors = (titanic[predictors].iloc[train, :])
    train_target = titanic['Survived'].iloc[train]
    # Fit on the training fold's X and y
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)


predictions = np.concatenate(predictions, axis=0)

predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0

# Evaluate the model: the accuracy is the fraction of predictions that match the true labels
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
# print(accuracy)


# Predict with logistic regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
# print(scores.mean())



# Predict with a random forest
from sklearn.ensemble import RandomForestClassifier

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=8, min_samples_leaf=8)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())



# On feature engineering (this matters a great deal):
# extract as many candidate features as possible,
# then compare how each feature affects the result.
# Feature engineering is a very important part of data mining.
# The features used so far all exist in the raw data; in real data mining
# work, suitable features are often missing and we have to construct our own.

# Derived feature: family size = siblings/spouses aboard (SibSp) + parents/children aboard (Parch)
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
# Length of the passenger's name
titanic['NameLength'] = titanic['Name'].apply(len)

import re

def get_title(name):
    # Use a regular expression to pull out the title (Mr, Mrs, Master, ...)
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

titles = titanic['Name'].apply(get_title)
# print(titles.value_counts())

# Different social ranks carry different titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,
                 "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2 }
for k, v in title_mapping.items():
    # Replace each title string with a numeric code
    titles[titles == k] = v

print(titles.value_counts())

titanic['Title'] = titles
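
# The extraction and mapping above can also be collapsed into one call,
# assuming every title that occurs in Name has an entry in title_mapping
# (a sketch only, so it is left commented out):
# titanic['Title'] = titanic['Name'].apply(get_title).map(title_mapping)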

# Feature selection via feature importance analysis:
# measure how much each feature affects the final result.
# For example, to gauge the importance of the Age column, first record a
# baseline error rate error1; then replace that column with noise (leaving
# every other column unchanged) and record a new error rate error2.
# The gap between the two error rates reflects how important the feature is.
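
# The noise-replacement idea just described is what scikit-learn calls
# permutation importance. A minimal sketch using
# sklearn.inspection.permutation_importance (available in scikit-learn 0.22
# and later; the rf name below is only illustrative):
from sklearn.inspection import permutation_importance

# Fit a forest, then shuffle each column in turn and measure the score drop
rf = RandomForestClassifier(random_state=1, n_estimators=100)
rf.fit(titanic[predictors].astype(float), titanic['Survived'])
result = permutation_importance(rf, titanic[predictors].astype(float),
                                titanic['Survived'], n_repeats=10, random_state=1)
for name, importance in zip(predictors, result.importances_mean):
    print(name, round(importance, 4))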
from sklearn.feature_selection import SelectKBest, f_classif

# Candidate features, including the three engineered above
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']

# Use univariate F-tests to score each feature against the target
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])

# Convert p-values to scores: larger means more informative
scores = -np.log10(selector.pvalues_)

plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
# plt.show()
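
# The chart is one way to pick winners; SelectKBest can also report its
# chosen columns directly through get_support (the selected name below is
# just illustrative):
selected = [predictors[i] for i in selector.get_support(indices=True)]
print(selected)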

# Based on the importance analysis above, keep the four strongest features and rerun the random forest
predictors = ['Pclass', 'Sex', 'Fare', 'Title']
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=8)
kf = KFold(n_splits=3)
scores = cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
print(scores.mean())


# Multiple algorithms can be trained together and their predictions averaged,
# which helps guard against overfitting

algorithms = [
    [LogisticRegression(random_state=1),
     ['Pclass', 'Sex', 'Fare', 'Title']],
    [RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=8, min_samples_leaf=8),
     ['Pclass', 'Sex', 'Fare', 'Title']]
]

# Initialize the cross validation folds
kf = KFold(n_splits=3)

predictions = []
for train, test in kf.split(titanic):
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select the test fold and predict on it.
        # astype(float) converts the dataframe to all floats so sklearn accepts it.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Simple ensembling: average the two algorithms' predicted probabilities
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 becomes a 1 prediction, .5 or below becomes 0
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array
predictions = np.concatenate(predictions, axis=0)

accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)
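
# For reference, scikit-learn's VotingClassifier packages the same
# soft-averaging idea into a single estimator; a minimal sketch on the same
# four features (the voter name below is only illustrative):
from sklearn.ensemble import VotingClassifier

voter = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=1)),
        ('rf', RandomForestClassifier(random_state=1, n_estimators=100,
                                      min_samples_split=8, min_samples_leaf=8)),
    ],
    voting='soft')  # average the predicted probabilities, then pick the class
scores = cross_val_score(voter, titanic[predictors].astype(float),
                         titanic['Survived'], cv=3)
print(scores.mean())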

 
