import pandas as pd
import matplotlib.pyplot as plt
# Load the training and test sets from CSV files under the local data/ directory.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")
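# Check for missing values (per-column null counts are listed below).
# Three columns have missing data: Age, Cabin and Embarked.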
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# Handle Age first: fill missing values with the mean age, rounded to a whole number.
train_data['Age'] = train_data['Age'].fillna(round(train_data['Age'].mean()))
# Cabin has too many missing values, so the column is dropped entirely.
train_data = train_data.drop(columns='Cabin')
# Embarked is a non-numeric column with only a few missing values: fill with the mode.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Embarked 0
dtype: int64
# Apply the same kind of missing-value handling to the test set.
# Age: fill with the test set's own mean age, rounded to a whole number.
test_data['Age'] = test_data['Age'].fillna(round(test_data['Age'].mean()))
# NOTE(review): Fare is numeric but is filled with its most frequent value (mode) here,
# unlike Age which uses the mean; Fare is dropped before modelling, so the choice
# should not affect the fitted models -- confirm if Fare is ever used as a feature.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mode()[0])
# Cabin is dropped, as it was for the training set.
test_data = test_data.drop(columns='Cabin')
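# Looking at the data, PassengerId is only a primary key with no hidden meaning, so it is not used as a model feature.
# Ticket and Name only show numbers without concrete meaning or booking-agency information, so they are not considered either.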
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S |
# Visualisation helper: shows the relation between one attribute and survival,
# once as absolute counts and once as a rate.
def showSurvivalUnderAttr(data, attribute):
    """Plot and print how survival relates to a single attribute.

    Two bar charts are drawn side by side on a new 10x5 figure:
    the left one shows the number of survivors and non-survivors for each
    value of ``attribute``; the right one shows the survival rate
    (survivors / all passengers) for each value, on a 0..1 y-axis.
    The survival rates are also printed after a 'survival rate:' header.

    Args:
        data: DataFrame containing a ``Survived`` column (1 = survived,
            0 = died) and the column named by ``attribute``.
        attribute: Name of the column to group by.
    """
    plt.figure(figsize=(10, 5))

    # Per-value head counts for survivors and non-survivors.
    survive = data.loc[data['Survived'] == 1, attribute].value_counts()
    die = data.loc[data['Survived'] == 0, attribute].value_counts()
    df_survival = pd.DataFrame({'Survive': survive, 'Die': die})
    df_survival.plot(y=['Survive', 'Die'], kind='bar', ax=plt.subplot(121))

    # Survival rate = survivors / everyone, aligned on the attribute value.
    population = data[attribute].value_counts()
    df_survive_rate = survive / population
    print('survival rate:')
    # The header alone is useless; print the actual rates underneath it.
    print(df_survive_rate)

    plot2 = df_survive_rate.plot(kind='bar', ax=plt.subplot(122), color=['g', 'b', 'r'], ylim=(0, 1))
    plot2.set_ylabel('survival rate')
# Pclass attribute: cabin class
showSurvivalUnderAttr(train_data, 'Pclass')
survival rate:
1 0.629630
2 0.472826
3 0.242363
Name: Pclass, dtype: float64
# Sex attribute: passenger gender
showSurvivalUnderAttr(train_data, 'Sex')
survival rate:
female 0.742038
male 0.188908
Name: Sex, dtype: float64
# Age
# Histogram of the age distribution in the training set, drawn with 10 bins
plt.hist(train_data['Age'], bins=10)
# Bucket Age into labelled ranges so the survival rate can be compared per age group.
def processAge(data):
    """Add an ``Age_cut`` column that buckets ``Age`` into five labelled ranges.

    Ranges: [0, 5] -> 'Baby', (5, 20] -> 'Teenager', (20, 30] -> 'Midlife',
    (30, 50] -> 'Prime', above 50 -> 'Older'.

    The last bin is open-ended instead of ending at ``int(max(Age))``.
    The data-dependent edge truncated fractional maxima (the oldest passenger
    could fall outside every bin and become NaN), raised when nobody was older
    than 50 (non-increasing edges), and made the bins differ between the
    training and test sets. ``include_lowest=True`` keeps an age of exactly 0
    inside the 'Baby' bin.

    Args:
        data: DataFrame with a numeric ``Age`` column. It is modified in place.

    Returns:
        The same DataFrame, with the categorical ``Age_cut`` column added.
    """
    bins = [0, 5, 20, 30, 50, float('inf')]
    labels = ['Baby', 'Teenager', 'Midlife', 'Prime', 'Older']
    data['Age_cut'] = pd.cut(data['Age'], bins=bins, labels=labels, include_lowest=True)
    return data
# Add the Age_cut column to the training set, then show survival per age group.
train_data = processAge(train_data)
showSurvivalUnderAttr(train_data, 'Age_cut')
survival rate:
Baby 0.704545
Teenager 0.377778
Midlife 0.334152
Prime 0.423237
Older 0.343750
Name: Age_cut, dtype: float64
分割结果如上图所示,可以看到不同年龄段的存活率存在较大差异。
def processParchAndSibSp(data):
    """Add a ``Family`` column that buckets family size into three groups.

    Family size is ``Parch + SibSp + 1`` (the passenger counts as one).
    Sizes 1-3 are 'small', 4-6 are 'middle', and anything larger is 'big'.

    The last bin is open-ended instead of ending at the largest family size
    found in ``data``. The data-dependent edge raised when no family was
    larger than 6 (non-increasing bin edges) and could differ between the
    training and test sets.

    Args:
        data: DataFrame with numeric ``Parch`` and ``SibSp`` columns.
            It is modified in place.

    Returns:
        The same DataFrame, with the categorical ``Family`` column added.
    """
    num_family = data['Parch'] + data['SibSp'] + 1
    bins = [0, 3, 6, float('inf')]
    labels = ['small', 'middle', 'big']
    data['Family'] = pd.cut(num_family, bins=bins, labels=labels)
    return data
# Add the Family column to the training set, then show survival per family-size group.
train_data = processParchAndSibSp(train_data)
showSurvivalUnderAttr(train_data, 'Family')
survival rate:
small 0.388750
middle 0.409091
big 0.160000
Name: Family, dtype: float64
import scipy.stats as stats
# Print the Pearson correlation between cabin class (Pclass) and ticket fare (Fare).
print(stats.pearsonr(train_data['Pclass'], train_data['Fare']))
(-0.5494996199439074, 1.96738617342106e-71)
# Embarked attribute: port of embarkation
showSurvivalUnderAttr(train_data, 'Embarked')
survival rate:
S 0.339009
C 0.553571
Q 0.389610
Name: Embarked, dtype: float64
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Embarked | Age_cut | Family | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | Midlife | small |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | Prime | small |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | Midlife | small |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | Prime | small |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | Prime | small |
剔除掉我们之前所说的不需要的属性:['PassengerId', 'Name', 'Age', 'Parch', 'Ticket', 'Fare', 'SibSp']
同时,刚才转换得到的属性和原有的属性中都存在类别属性:['Age_cut', 'Embarked', 'Pclass', 'Sex', 'Family'],需要对它们进行 one-hot 编码。
# One-hot encoding of categorical columns.
def transformCategoryToValue(data, attributes):
    """Replace each listed column with its one-hot indicator columns.

    For every name in ``attributes`` the column is expanded with
    ``pd.get_dummies`` (indicator names are prefixed with the column name),
    the indicators are appended on the right, and the source column is
    removed.

    Args:
        data: Source DataFrame.
        attributes: Iterable of column names to encode, processed in order.

    Returns:
        A DataFrame with the encoded columns; ``data`` itself is returned
        unchanged when ``attributes`` is empty.
    """
    encoded = data
    for column_name in attributes:
        indicators = pd.get_dummies(encoded[column_name], prefix=column_name)
        # Drop the source column and append its indicators in one step.
        encoded = pd.concat([encoded.drop(columns=column_name), indicators], axis=1)
    return encoded
# Column removal helper.
def dropAttr(data, attributes):
    """Remove the listed columns from ``data`` in place and return it.

    Args:
        data: DataFrame to modify; the caller's object is changed.
        attributes: Iterable of column names, dropped one at a time in order.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column_name in attributes:
        data.drop(columns=column_name, inplace=True)
    return data
# Categorical columns to one-hot encode, and raw columns that are not used as features.
one_hot_list = ['Age_cut', 'Embarked', 'Pclass', 'Sex', 'Family']
drop_list = ['PassengerId', 'Name', 'Age', 'Parch', 'Ticket', 'Fare', 'SibSp']

train_data = dropAttr(train_data, drop_list)
train_data = transformCategoryToValue(train_data, one_hot_list)

# Keep PassengerId for the submission file before it is dropped from the features.
test_id = test_data['PassengerId']

# The test set needs the same derived columns as the training set. They must be
# built BEFORE Age/Parch/SibSp are dropped; otherwise 'Age_cut' and 'Family' do
# not exist, the one-hot step fails, and raw string columns such as Sex reach
# the model.
test_data = processAge(test_data)
test_data = processParchAndSibSp(test_data)
test_data = dropAttr(test_data, drop_list)
test_data = transformCategoryToValue(test_data, one_hot_list)

# Give the test features exactly the training feature columns, in the same order;
# an indicator column that is absent from the test set is filled with 0.
test_data = test_data.reindex(columns=train_data.columns.drop('Survived'), fill_value=0)
Survived | Age_cut_Baby | Age_cut_Teenager | Age_cut_Midlife | Age_cut_Prime | Age_cut_Older | Embarked_C | Embarked_Q | Embarked_S | Pclass_1 | Pclass_2 | Pclass_3 | Sex_female | Sex_male | Family_small | Family_middle | Family_big | |
0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Target vector: the Survived column of the training set.
Y_train = train_data['Survived']
# Training features: every remaining column.
X_train = train_data.drop('Survived', axis=1)
# Features the chosen model will predict on.
X_test = test_data
def modelSelection():
    """Fit several classifiers and return the one with the highest score.

    Each candidate is fitted on the module-level ``X_train`` / ``Y_train`` and
    scored with its own ``score`` method on that same training data; every
    score is printed, followed by the name and score of the winner.
    On a tie the earliest candidate in the list is kept.

    NOTE(review): the score is measured on the data the model was fitted on,
    so it favours models that memorise the training set; consider a held-out
    split or cross-validation.

    Returns:
        The fitted estimator with the highest score.
    """
    candidates = [
        ('LogisticRegression', LogisticRegression()),
        ('Perceptron', Perceptron()),
        ('SGDClassifier', SGDClassifier()),
        ('SVC', SVC()),
        ('KNeighborsClassifier', KNeighborsClassifier()),
        ('GaussianNB', GaussianNB()),
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('RandomForestClassifier', RandomForestClassifier(n_estimators=100)),
    ]

    bestModel = None
    bestModelName = ''
    # Start below any possible score so that a model is always selected.
    bestScore = float('-inf')

    for modelName, tmpModel in candidates:
        tmpModel.fit(X_train, Y_train)
        score = tmpModel.score(X_train, Y_train)
        print(modelName + ':', score)
        # The running best must NOT be reset here; resetting it on every
        # iteration made the last candidate win regardless of its score.
        if score > bestScore:
            bestScore = score
            bestModel = tmpModel
            bestModelName = modelName

    print('BestModel is ' + bestModelName + ', score is ' + str(bestScore))
    return bestModel
# Fit all candidate models and keep the one that modelSelection() returns.
bestModel = modelSelection()
LogisticRegression: 0.8125701459034792
Perceptron: 0.7441077441077442
SGDClassifier: 0.7890011223344556
SVC: 0.835016835016835
KNeighborsClassifier: 0.8204264870931538
GaussianNB: 0.7901234567901234
DecisionTreeClassifier: 0.8406285072951739
RandomForestClassifier: 0.8406285072951739
BestModel is RandomForestClassifier, score is 0.8406285072951739
/Users/usyun/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:493: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names unseen at fit time:
- Embarked
- Pclass
- Sex
Feature names seen at fit time, yet now missing:
- Age_cut_Baby
- Age_cut_Midlife
- Age_cut_Older
- Age_cut_Prime
- Age_cut_Teenager
- ...
warnings.warn(message, FutureWarning)
ValueError Traceback (most recent call last)
Input In [87], in <cell line: 38>()
34 return bestModel
37 bestModel = modelSelection()
---> 38 Y_pred = bestModel.predict(X_test)
39 submission = pd.DataFrame({
40 'PassengerId': test_id,
41 'Survived': Y_pred
42 })
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:808, in ForestClassifier.predict(self, X)
787 def predict(self, X):
788 """
789 Predict class for X.
806 The predicted classes.
807 """
--> 808 proba = self.predict_proba(X)
810 if self.n_outputs_ == 1:
811 return self.classes_.take(np.argmax(proba, axis=1), axis=0)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:850, in ForestClassifier.predict_proba(self, X)
848 check_is_fitted(self)
849 # Check data
--> 850 X = self._validate_X_predict(X)
852 # Assign chunk of trees to jobs
853 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py:579, in BaseForest._validate_X_predict(self, X)
576 """
577 Validate X whenever one tries to predict, apply, predict_proba."""
578 check_is_fitted(self)
--> 579 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
580 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
581 raise ValueError("No support for np.int64 index based sparse matrices")
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:566, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:746, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
749 "Complex data not supported\n{}\n".format(array)
750 ) from complex_warning
File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py:2064, in NDFrame.__array__(self, dtype)
2063 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-> 2064 return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'male'
# Predict survival for the test passengers with the selected model.
Y_pred = bestModel.predict(X_test)
# Build the submission table: one row per test passenger, pairing the saved
# PassengerId with the predicted Survived value. Without this, `submission`
# is undefined when it is written out below.
submission = pd.DataFrame({
    'PassengerId': test_id,
    'Survived': Y_pred
})
submission.to_csv('./submission.csv', index=False)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· 阿里巴巴 QwQ-32B真的超越了 DeepSeek R-1吗?
· 【译】Visual Studio 中新的强大生产力特性
· 【设计模式】告别冗长if-else语句:使用策略模式优化代码结构
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义