Titanic Project
import pandas as pd
df_train = pd.read_csv("F:/Python CODE/Kaggle_Titanic/train.csv")
df_test = pd.read_csv("F:/Python CODE/Kaggle_Titanic/test.csv")
In [2]:
df_train.head()  # view the first 5 rows of the table
Out[2]:
SibSp -- number of siblings/spouses aboard
Parch -- number of parents/children aboard
Ticket -- ticket number
Fare -- ticket fare
Cabin -- cabin number
Embarked -- port of embarkation
In [3]:
df_train.info()  # overview of the table: column types and non-null counts
In [4]:
df_train.describe()  # descriptive statistics for the numeric columns
Out[4]:
In [5]:
df_train[["Name","Sex","Ticket","Cabin","Embarked"]].describe()  # describe() also works for object (string) columns
Out[5]:
In [6]:
# Feature analysis: among the 11 features, find the ones related to survival
import numpy as np
import matplotlib.pyplot as plt
Pclass_Survied = pd.crosstab(df_train['Pclass'], df_train['Survived'])  # contingency table of Pclass vs Survived
In [7]:
Pclass_Survied
Out[7]:
In [8]:
Pclass_Survied.plot(kind = 'bar', stacked = True)  # stacked bar chart
plt.show()
In [9]:
Pclass_Survied.count()
Out[9]:
In [10]:
Pclass_Survied.index
Out[10]:
In [11]:
Survied_len = len(Pclass_Survied.count())
Pclass_index = np.arange(len(Pclass_Survied.index))
In [12]:
Pclass_index
Out[12]:
In [13]:
Pclass_Survied
Out[13]:
In [14]:
Pclass_Survied.plot(kind = 'bar', stacked = True)  # stacked bar chart
Sum1 = 0
for i in range(Survied_len):
    SurvivedName = Pclass_Survied.columns[i]
    PclassCount = Pclass_Survied[SurvivedName]
    # Sum1 is the cumulative bar height, Sum2 the height before the current segment
    Sum1, Sum2 = Sum1 + PclassCount, Sum1
    # Zsum is the vertical midpoint of the current segment, used to place its label
    Zsum = Sum2 + (Sum1 - Sum2) / 2
    for x, y, z in zip(Pclass_index, PclassCount, Zsum):
        plt.text(x, z, '%.0f' % y, ha = 'center', va = 'center')  # add data labels
# fix the x-axis tick labels (bar positions are 0..2, Pclass values are 1..3)
plt.xticks(Pclass_Survied.index - 1, Pclass_Survied.index, rotation=360)
plt.title('Survived status by pclass')
plt.show()
In [15]:
a = df_train.Pclass[df_train['Survived']==0].value_counts()
b = df_train.Pclass[df_train['Survived']==1].value_counts()
Pclass_Survived = pd.DataFrame({ 0: a, 1: b})
In [16]:
Pclass_Survived
Out[16]:
In [17]:
import re
df_train['Appellation'] = df_train.Name.apply(lambda x: re.search(r'\w+\.', x).group()).str.replace('.', '', regex=False)
df_train.Appellation.unique()
Out[17]:
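To see what the pattern above pulls out of a name, here is a quick illustrative check on one sample value from the Name column (a sketch added for clarity, not part of the original notebook):
import re
re.search(r'\w+\.', 'Braund, Mr. Owen Harris').group()  # -> 'Mr.', and the trailing dot is then stripped by str.replace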
In [18]:
Appellation_Sex = pd.crosstab(df_train.Sex, df_train.Appellation)
Appellation_Sex
Out[18]:
In [19]:
df_train['Appellation'] = df_train['Appellation'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'], 'Rare')
df_train['Appellation'] = df_train['Appellation'].replace(['Mlle','Ms'], 'Miss')
df_train['Appellation'] = df_train['Appellation'].replace('Mme', 'Mrs')
df_train.Appellation.unique()
Out[19]:
In [44]:
Appellation_Survived = pd.crosstab(df_train['Appellation'], df_train['Survived'])
Appellation_Survived.plot(kind = 'bar')
plt.xticks(np.arange(len(Appellation_Survived.index)), Appellation_Survived.index, rotation = 360)
plt.title('Survived status by Appellation')
plt.show()
In [24]:
Sex_Survived = pd.crosstab(df_train['Sex'],df_train['Survived'])
In [45]:
# Contingency table of Sex vs Survived
Sex_Survived = pd.crosstab(df_train['Sex'], df_train['Survived'])
Survived_len = len(Sex_Survived.count())
Sex_index = np.arange(len(Sex_Survived.index))
single_width = 0.35
for i in range(Survived_len):
    SurvivedName = Sex_Survived.columns[i]
    SexCount = Sex_Survived[SurvivedName]
    SexLocation = Sex_index * 1.05 + (i - 1/2) * single_width
    # draw one group of bars per Survived value
    plt.bar(SexLocation, SexCount, width = single_width)
    for x, y in zip(SexLocation, SexCount):
        # add data labels
        plt.text(x, y, '%.0f' % y, ha='center', va='bottom')
index = Sex_index * 1.05
plt.xticks(index, Sex_Survived.index, rotation=360)
plt.title('Survived status by sex')
plt.show()
In [46]:
SibSp_Survived = pd.crosstab(df_train['SibSp'], df_train['Survived'])
SibSp_Survived.plot(kind = 'bar')
plt.xticks(np.arange(len(SibSp_Survived.index)), SibSp_Survived.index, rotation = 360)
plt.title('Survived status by SibSp')
plt.show()
In [47]:
SibSp_Survived = pd.crosstab(df_train.SibSp[df_train['SibSp']>2], df_train['Survived'])
SibSp_Survived.plot(kind = 'bar')
plt.xticks([0,1,2,3],SibSp_Survived.index,rotation = 360)
plt.title('Survived status by SibSp')
plt.show()
In [28]:
Ticket_Count = df_train.groupby('Ticket',as_index=False)['PassengerId'].count()
In [29]:
Ticket_Count.head()
Out[29]:
In [30]:
# Explanation of as_index=False in the groupby call above
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 'price': [12,12,12,15,15,17]})
print(df)
print("*********************")
print (df.groupby('books', as_index=True).sum())
print("*********************")
print (df.groupby('books', as_index=False).sum())
In [31]:
Ticket_Count_0 = Ticket_Count[Ticket_Count.PassengerId == 1]['Ticket']
In [32]:
Ticket_Count_0.head()
Out[32]:
In [33]:
df_train['GroupTicket'] = np.where(df_train.Ticket.isin(Ticket_Count_0),0,1)
In [34]:
GroupTicket_Survived = pd.crosstab(df_train['GroupTicket'],df_train['Survived'])
GroupTicket_Survived.plot(kind='bar')
plt.xticks(rotation =360)
Out[34]:
In [35]:
bins = [0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600]
df_train['GroupFare'] = pd.cut(df_train.Fare,bins,right=False)
GroupFare_Survived = pd.crosstab(df_train['GroupFare'],df_train['Survived'])
GroupFare_Survived.plot(kind = 'bar')
Out[35]:
In [36]:
GroupFare_Survived.iloc[2:].plot(kind = 'bar')
Out[36]:
In [ ]:
# All of the analysis above used only the non-missing parts of each feature.
# Next, feature engineering handles the missing values in Age, Cabin and Embarked.
In [37]:
df_train['Embarked'].mode()
Out[37]:
In [38]:
# df_train['Embarked'].mode()[0] -- the mode may contain several values; [0] takes the first one
train = df_train.copy()
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
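As a toy illustration (not taken from the dataset) of why the [0] above is needed, a Series can have more than one mode:
pd.Series(['S', 'S', 'C', 'C']).mode()  # -> ['C', 'S']: two modes, so [0] picks the first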
In [39]:
train['Cabin'] = train['Cabin'].fillna('NO')
In [40]:
Age_Appellation_median = train.groupby('Appellation')['Age'].median()
In [52]:
Age_Appellation_median
Out[52]:
In [59]:
train.set_index('Appellation', inplace = True)
# Fill the missing Age values in place; fillna aligns the Appellation-indexed medians with the row index
train.Age.fillna(Age_Appellation_median, inplace = True)
# Reset the index
train.reset_index(inplace = True)
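The set_index / fillna / reset_index sequence works because fillna aligns the per-Appellation medians with the DataFrame's index. As a sketch of an equivalent alternative (not the notebook's original code), the group medians could instead be broadcast back onto the rows with transform:
# Equivalent sketch: fill missing ages with each Appellation group's median in one step
train['Age'] = train['Age'].fillna(train.groupby('Appellation')['Age'].transform('median'))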
In [60]:
train
Out[60]:
In [62]:
train.Age.isnull().sum()
Out[62]:
In [64]:
train.Age.isnull().any()
Out[64]:
In [65]:
train.Age.describe()
Out[65]:
In [66]:
Embarked_Survived = pd.crosstab(train['Embarked'],train['Survived'])
In [68]:
Embarked_Survived.plot(kind = 'bar')
plt.xticks(rotation = 360)
plt.title('Survived status by Embarked')
plt.show()
In [69]:
train
Out[69]:
In [80]:
train['GroupCabin'] = np.where(train['Cabin'] == 'NO',0,1)
In [82]:
GroupCabin_Survived = pd.crosstab(train['GroupCabin'],train['Survived'])
GroupCabin_Survived.plot(kind = 'bar')
plt.title('Survived status by GroupCabin')
plt.xticks(rotation=360)
plt.show()
In [86]:
# Bin Age into 10 groups: 2**10 = 1024 > 891 rows, so use 10 bins; bin width = (max 80 - min 0)/10 = 8, rounded up to 9
bins = [0, 9, 18, 27, 36, 45, 54, 63, 72, 81, 90]
train['GroupAge'] = pd.cut(train.Age, bins)
GroupAge_Survived = pd.crosstab(train['GroupAge'], train['Survived'])
GroupAge_Survived.plot(kind = 'bar')
plt.title('Survived status by GroupAge')
plt.show()
In [87]:
train['Appellation'] = train.Appellation.map({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rare': 4})
train.Appellation.unique()
Out[87]:
In [89]:
train['Sex'] = train.Sex.map({'female':0,'male':1})
In [90]:
train.head()
Out[90]:
In [95]:
train.loc[train['Age'] < 9, 'Age']=0
train.loc[(train['Age'] >= 9) & (train['Age'] < 18), 'Age'] = 1
train.loc[(train['Age'] >= 18) & (train['Age'] < 27), 'Age'] = 2
train.loc[(train['Age'] >= 27) & (train['Age'] < 36), 'Age'] = 3
train.loc[(train['Age'] >= 36) & (train['Age'] < 45), 'Age'] = 4
train.loc[(train['Age'] >= 45) & (train['Age'] < 54), 'Age'] = 5
train.loc[(train['Age'] >= 54) & (train['Age'] < 63), 'Age'] = 6
train.loc[(train['Age'] >= 63) & (train['Age'] < 72), 'Age'] = 7
train.loc[(train['Age'] >= 72) & (train['Age'] < 81), 'Age'] = 8
train.loc[(train['Age'] >= 81) & (train['Age'] < 90), 'Age'] = 9
train.Age.unique()
Out[95]:
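The ten .loc assignments above (and the analogous block for Fare further down) map each age into its 9-year bucket by hand. As a rough sketch under the same bin edges (illustrative only, not the original code), pd.cut with integer labels could produce the same codes in place of that block:
# Sketch: the same left-closed 9-year buckets, labelled 0..9; would replace the ten .loc lines
age_codes = pd.cut(train['Age'], bins=[0, 9, 18, 27, 36, 45, 54, 63, 72, 81, 90],
                   labels=range(10), right=False)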
In [96]:
train.head()
Out[96]:
In [97]:
# When both SibSp and Parch are 0, the passenger is travelling alone.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train.FamilySize.unique()
Out[97]:
In [98]:
train.loc[train['Fare'] < 60, 'Fare'] = 0
train.loc[(train['Fare'] >= 60) & (train['Fare'] < 120), 'Fare'] = 1
train.loc[(train['Fare'] >= 120) & (train['Fare'] < 180), 'Fare'] = 2
train.loc[(train['Fare'] >= 180) & (train['Fare'] < 240), 'Fare'] = 3
train.loc[(train['Fare'] >= 240) & (train['Fare'] < 300), 'Fare'] = 4
train.loc[(train['Fare'] >= 300) & (train['Fare'] < 360), 'Fare'] = 5
train.loc[(train['Fare'] >= 360) & (train['Fare'] < 420), 'Fare'] = 6
train.loc[(train['Fare'] >= 420) & (train['Fare'] < 480), 'Fare'] = 7
train.loc[(train['Fare'] >= 480) & (train['Fare'] < 540), 'Fare'] = 8
train.loc[(train['Fare'] >= 540) & (train['Fare'] < 600), 'Fare'] = 9
train.Fare.unique()
Out[98]:
In [99]:
train['Embarked'] = train.Embarked.map({'S': 0, 'C': 1, 'Q': 2})
In [100]:
train.drop(['PassengerId', 'Name', 'GroupAge', 'SibSp', 'Parch', 'Ticket', 'GroupFare', 'Cabin'], axis = 1, inplace =True)
In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X=train[['Pclass', 'Appellation', 'Sex', 'Age', 'FamilySize', 'GroupTicket', 'Fare', 'GroupCabin', 'Embarked']]
y=train['Survived']
# Randomly split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Initialise the logistic regression model
lg = LogisticRegression()
# Fit the model on the training data
lg.fit(X_train, y_train)
# Score the model on the held-out test data
lg.score(X_test, y_test)
Out[110]:
In [111]:
from sklearn.tree import DecisionTreeClassifier
# Max tree depth 15, min samples to split an internal node 2, min samples per leaf 1, at most 10 leaf nodes, at most 6 features per split
dt = DecisionTreeClassifier(max_depth=15, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=10, max_features=6)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
Out[111]:
In [126]:
# Support vector machine (SVM)
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from scipy.stats import sem

# Helper to evaluate a classifier with K-fold cross-validation
def evaluate_cross_validation(clf, X, y, K):
    # KFold takes the number of splits, whether to shuffle, and a random seed
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # Cross-validate on these splits; for classifiers the default score is accuracy,
    # but another scoring metric can be passed instead
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))

# SVC with an RBF (Gaussian) kernel; other kernels ('linear', 'poly', 'sigmoid', 'precomputed') can give very different results
svc_rbf = SVC(kernel='rbf')
# 5-fold cross-validation, K = 5
evaluate_cross_validation(svc_rbf, X_train, y_train, 5)
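Since different kernels can give very different results, a small sketch (assuming the same X_train/y_train; the grid values are illustrative, not part of the original notebook) shows one way to search over kernel and C with GridSearchCV:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hypothetical parameter grid; values chosen only for illustration
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)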
In [118]:
# Linear classifier
from sklearn.linear_model import SGDClassifier
# SGDClassifier estimates its parameters by stochastic gradient descent and suits large datasets
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Import the evaluation module
from sklearn import metrics
y_train_predict = clf.predict(X_train)
# In-sample check: accuracy on the training samples
print(metrics.accuracy_score(y_train, y_train_predict))
# Proper evaluation: accuracy on the held-out test samples
y_predict = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_predict))
In [123]:
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_predict =clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_predict))