泰坦尼克号项目

import pandas as pd
# Load the Kaggle Titanic training and test sets from the local CSV files.
df_train = pd.read_csv("F:/Python CODE/Kaggle_Titanic/train.csv")
df_test = pd.read_csv("F:/Python CODE/Kaggle_Titanic/test.csv")
In [2]:
df_train.head()  # show the first 5 rows of the table
Out[2]:
 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
 

SibSp -- 同船配偶以及兄弟姐妹的人数

Parch -- 同船父母或者子女的人数

Ticket -- 船票

Fare -- 票价

Cabin -- 舱位

Embarked -- 登船港口

In [3]:
df_train.info()  # overall summary of the table: dtype and non-null count per column
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
In [4]:
df_train.describe()  # descriptive statistics
Out[4]:
 PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [5]:
df_train[["Name","Sex","Ticket","Cabin","Embarked"]].describe()  # describe() also works on object-dtype (Python object) columns
Out[5]:
 Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Green, Mr. George Henry male CA. 2343 G6 S
freq 1 577 7 4 644
In [6]:
# Feature analysis: among the 11 features, find the ones related to survival.
import numpy as np
import matplotlib.pyplot as plt
Pclass_Survied = pd.crosstab(df_train['Pclass'],df_train['Survived'])  # contingency table of Pclass by Survived
In [7]:
Pclass_Survied
Out[7]:
Survived 0 1
Pclass  
1 80 136
2 97 87
3 372 119
In [8]:
Pclass_Survied.plot(kind = 'bar',stacked = True)  # stacked bar chart
plt.show()
 
In [9]:
Pclass_Survied.count()
Out[9]:
Survived
0    3
1    3
dtype: int64
In [10]:
Pclass_Survied.index
Out[10]:
Int64Index([1, 2, 3], dtype='int64', name='Pclass')
In [11]:
# Number of Survived categories (the crosstab's columns) and the 0-based
# positions of its Pclass rows.
Survied_len = Pclass_Survied.shape[1]
Pclass_index = np.arange(Pclass_Survied.shape[0])
In [12]:
Pclass_index
Out[12]:
array([0, 1, 2])
In [13]:
Pclass_Survied
Out[13]:
Survived01
Pclass  
1 80 136
2 97 87
3 372 119
In [14]:
# Stacked bar chart of survival counts per passenger class, with the count
# printed in the middle of every bar segment.
Pclass_Survied.plot(kind='bar', stacked=True)
# A pandas bar plot draws its bars at positions 0..n-1 regardless of the index
# labels, so derive positions from the row count instead of from `index - 1`
# (which is only correct while the class labels happen to be 1, 2, 3).
bar_positions = np.arange(len(Pclass_Survied.index))
segment_bottom = np.zeros(len(Pclass_Survied.index))
for survived_value in Pclass_Survied.columns:
    segment_height = Pclass_Survied[survived_value].values
    # Each label sits halfway up its own segment: bottom + height / 2.
    label_heights = segment_bottom + segment_height / 2
    for x, count, y in zip(bar_positions, segment_height, label_heights):
        plt.text(x, y, '%.0f' % count, ha='center', va='center')  # data label
    # The next Survived category is stacked on top of this one.
    segment_bottom = segment_bottom + segment_height
# Replace the x-axis tick labels with the class labels.
plt.xticks(bar_positions, Pclass_Survied.index, rotation=360)
plt.title('Survived status by pclass')
plt.show()
 
In [15]:
# Build the same Pclass-by-Survived table by hand: count the passenger classes
# separately for those who died (0) and those who survived (1).
died_mask = df_train['Survived'] == 0
lived_mask = df_train['Survived'] == 1
a = df_train.loc[died_mask, 'Pclass'].value_counts()
b = df_train.loc[lived_mask, 'Pclass'].value_counts()
Pclass_Survived = pd.DataFrame({0: a, 1: b})
In [16]:
Pclass_Survived
Out[16]:
 01
1 80 136
2 97 87
3 372 119
In [17]:
import re
# Extract each passenger's title: the word directly before the first period in
# the name (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string (no invalid "\w" escape), and the capture group
# leaves the period out, so no follow-up `.str.replace('.', '')` is needed --
# that call's meaning depends on the pandas version's default for `regex`.
# A name without any "word." part yields NaN instead of raising AttributeError.
df_train['Appellation'] = df_train.Name.str.extract(r'(\w+)\.', expand=False)
df_train.Appellation.unique()
Out[17]:
array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)
In [18]:
# Cross-tabulate Sex against the extracted title (Appellation).
Application_Sex = pd.crosstab(df_train.Sex,df_train.Appellation)
Application_Sex
Out[18]:
AppellationCaptColCountessDonDrJonkheerLadyMajorMasterMissMlleMmeMrMrsMsRevSir
Sex                 
female 0 0 1 0 1 0 1 0 0 182 2 1 0 125 1 0 0
male 1 2 0 1 6 1 0 2 40 0 0 0 517 0 0 6 1
In [19]:
# Collapse the titles into five groups: infrequent titles become 'Rare', and
# the French/alternative forms are folded into 'Miss' and 'Mrs'.
rare_titles = ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady',
               'Major', 'Rev', 'Sir']
title_groups = dict.fromkeys(rare_titles, 'Rare')
title_groups.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df_train['Appellation'] = df_train['Appellation'].replace(title_groups)
df_train.Appellation.unique()
Out[19]:
array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object)
In [44]:
# Grouped bar chart of survival counts for each title group.
Appellation_Survived = pd.crosstab(df_train['Appellation'], df_train['Survived'])
Appellation_Survived.plot.bar()
tick_positions = np.arange(Appellation_Survived.shape[0])
plt.xticks(tick_positions, Appellation_Survived.index, rotation=360)
plt.title('Survived status by Appellation')
plt.show()
 
 
 
 
 
 
 
In [24]:
# Contingency table of Sex by Survived.
Sex_Survived = pd.crosstab(df_train['Sex'],df_train['Survived'])
In [45]:
# Build the contingency table.
Sex_Survived = pd.crosstab(df_train['Sex'], df_train['Survived'])
Survived_len = Sex_Survived.shape[1]
Sex_index = np.arange(Sex_Survived.shape[0])
single_width = 0.35
# Centre of each sex group on the x axis; groups are spaced 1.05 apart.
group_centers = Sex_index * 1.05
for i, SurvivedName in enumerate(Sex_Survived.columns):
    SexCount = Sex_Survived[SurvivedName]
    # Shift the i-th Survived series sideways so the bars sit next to each other.
    SexLocation = group_centers + (i - 1/2) * single_width
    # Draw the bars for this series.
    plt.bar(SexLocation, SexCount, width=single_width)
    # Print the count just above each bar.
    for bar_x, bar_height in zip(SexLocation, SexCount):
        plt.text(bar_x, bar_height, '%.0f' % bar_height, ha='center', va='bottom')
index = Sex_index * 1.05
plt.xticks(index, Sex_Survived.index, rotation=360)
plt.title('Survived status by sex')
plt.show()
 
In [46]:
# Grouped bar chart of survival counts by number of siblings/spouses aboard.
SibSp_Survived = pd.crosstab(df_train['SibSp'], df_train['Survived'])
SibSp_Survived.plot(kind='bar')
# The bars are drawn at positions 0..n-1, not at the SibSp values themselves.
# Using the index as tick positions misplaces every label after a gap in the
# SibSp values, so place the ticks by position and label them with the index.
plt.xticks(np.arange(len(SibSp_Survived.index)), SibSp_Survived.index, rotation=360)
plt.title('Survived status by SibSp')
plt.show()
 
In [47]:
# Same chart restricted to passengers with more than 2 siblings/spouses, so
# the small counts are readable.
SibSp_Survived = pd.crosstab(df_train.SibSp[df_train['SibSp'] > 2], df_train['Survived'])
SibSp_Survived.plot(kind='bar')
# Derive the tick positions from the number of rows instead of hard-coding
# [0, 1, 2, 3], which breaks as soon as the data has a different number of
# distinct SibSp values above 2.
plt.xticks(np.arange(len(SibSp_Survived.index)), SibSp_Survived.index, rotation=360)
plt.title('Survived status by SibSp')
plt.show()
 
In [28]:
# Count the passengers on each ticket; as_index=False keeps Ticket as a regular column.
Ticket_Count =  df_train.groupby('Ticket',as_index=False)['PassengerId'].count()
In [29]:
Ticket_Count.head()
Out[29]:
 TicketPassengerId
0 110152 3
1 110413 3
2 110465 2
3 110564 1
4 110813 1
In [30]:
# Demonstrates what as_index=False does in the groupby call above:
# with as_index=True the group key becomes the index of the result,
# with as_index=False it stays a regular column and the index is 0..n-1.
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 'price': [12,12,12,15,15,17]})
print(df)
print("*********************")
print (df.groupby('books', as_index=True).sum())
print("*********************")
print (df.groupby('books', as_index=False).sum())
 
  books  price
0   bk1     12
1   bk1     12
2   bk1     12
3   bk2     15
4   bk2     15
5   bk3     17
*********************
       price
books       
bk1       36
bk2       30
bk3       17
*********************
  books  price
0   bk1     36
1   bk2     30
2   bk3     17
In [31]:
# Tickets that are held by exactly one passenger.
Ticket_Count_0 = Ticket_Count[Ticket_Count.PassengerId == 1]['Ticket']
In [32]:
Ticket_Count_0.head()
Out[32]:
3    110564
4    110813
5    111240
6    111320
8    111369
Name: Ticket, dtype: object
In [33]:
# GroupTicket is 0 when the passenger's ticket is held by one passenger only, 1 when it is shared.
df_train['GroupTicket'] = np.where(df_train.Ticket.isin(Ticket_Count_0),0,1)
In [34]:
# Survival counts for single-ticket (0) versus shared-ticket (1) passengers.
GroupTicket_Survived = pd.crosstab(df_train['GroupTicket'], df_train['Survived'])
GroupTicket_Survived.plot.bar()
plt.xticks(rotation=360)
Out[34]:
(array([0, 1]), <a list of 2 Text xticklabel objects>)
In [35]:
# Bucket the fare into ten left-closed intervals of width 60: [0, 60), ..., [540, 600).
bins = list(range(0, 601, 60))
df_train['GroupFare'] = pd.cut(df_train.Fare, bins, right=False)
GroupFare_Survived = pd.crosstab(df_train['GroupFare'], df_train['Survived'])
GroupFare_Survived.plot.bar()
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0xac47eb8>
In [36]:
# Plot again without the first two fare groups.
GroupFare_Survived.iloc[2:].plot(kind = 'bar')
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0xa7a4ef0>
In [ ]:
#以上所有操作都是对特征中无缺失部分进行分析
#下一步则会在特征工程中对缺失部分进行处理Age、Cabin、Embarked
In [37]:
df_train['Embarked'].mode()
Out[37]:
0    S
dtype: object
In [38]:
# mode() can return several values; [0] takes the first one.
# Work on a copy so df_train keeps its original missing values.
train = df_train.copy()
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
In [39]:
# Mark a missing cabin with the placeholder string 'NO'.
train['Cabin'] = train['Cabin'].fillna('NO')
In [40]:
# Median age within each title group.
Age_Appellation_median = train.groupby('Appellation')['Age'].median()
In [52]:
Age_Appellation_median
Out[52]:
Appellation
Master     3.5
Miss      21.0
Mr        30.0
Mrs       35.0
Rare      48.5
Name: Age, dtype: float64
In [59]:
# Fill missing ages with the median age of the passenger's title group.
# Index the rows by title so that fillna can align the per-title medians
# (a Series indexed by Appellation) with each row.
train.set_index('Appellation', inplace=True)
# Assign the result back instead of calling fillna(inplace=True) on
# `train.Age`: an in-place operation on an attribute-accessed column acts on
# an intermediate object and does not update `train` under pandas
# copy-on-write.
train['Age'] = train['Age'].fillna(Age_Appellation_median)
# Restore the default integer index; Appellation becomes a regular column again.
train.reset_index(inplace=True)
In [60]:
train
Out[60]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0 Mr 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NO S 0 [0, 60)
1 Mrs 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0 [60, 120)
2 Miss 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NO S 0 [0, 60)
3 Mrs 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 1 [0, 60)
4 Mr 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NO S 0 [0, 60)
5 Mr 6 0 3 Moran, Mr. James male 30.0 0 0 330877 8.4583 NO Q 0 [0, 60)
6 Mr 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 0 [0, 60)
7 Master 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NO S 1 [0, 60)
8 Mrs 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NO S 1 [0, 60)
9 Mrs 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NO C 1 [0, 60)
10 Miss 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S 1 [0, 60)
11 Miss 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S 0 [0, 60)
12 Mr 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NO S 0 [0, 60)
13 Mr 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NO S 1 [0, 60)
14 Miss 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NO S 0 [0, 60)
15 Mrs 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NO S 0 [0, 60)
16 Master 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NO Q 1 [0, 60)
17 Mr 18 1 2 Williams, Mr. Charles Eugene male 30.0 0 0 244373 13.0000 NO S 0 [0, 60)
18 Mrs 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NO S 0 [0, 60)
19 Mrs 20 1 3 Masselmani, Mrs. Fatima female 35.0 0 0 2649 7.2250 NO C 0 [0, 60)
20 Mr 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NO S 1 [0, 60)
21 Mr 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S 0 [0, 60)
22 Miss 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NO Q 0 [0, 60)
23 Mr 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S 0 [0, 60)
24 Miss 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NO S 1 [0, 60)
25 Mrs 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NO S 1 [0, 60)
26 Mr 27 0 3 Emir, Mr. Farred Chehab male 30.0 0 0 2631 7.2250 NO C 0 [0, 60)
27 Mr 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S 1 [240, 300)
28 Miss 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female 21.0 0 0 330959 7.8792 NO Q 0 [0, 60)
29 Mr 30 0 3 Todoroff, Mr. Lalio male 30.0 0 0 349216 7.8958 NO S 0 [0, 60)
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 Mr 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 NO S 0 [0, 60)
862 Mrs 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S 0 [0, 60)
863 Miss 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female 21.0 8 2 CA. 2343 69.5500 NO S 1 [60, 120)
864 Mr 865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 NO S 0 [0, 60)
865 Mrs 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 NO S 0 [0, 60)
866 Miss 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 NO C 0 [0, 60)
867 Mr 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S 0 [0, 60)
868 Mr 869 0 3 van Melkebeke, Mr. Philemon male 30.0 0 0 345777 9.5000 NO S 0 [0, 60)
869 Master 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NO S 1 [0, 60)
870 Mr 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 NO S 0 [0, 60)
871 Mrs 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S 1 [0, 60)
872 Mr 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S 0 [0, 60)
873 Mr 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 NO S 0 [0, 60)
874 Mrs 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 NO C 1 [0, 60)
875 Miss 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 NO C 0 [0, 60)
876 Mr 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NO S 1 [0, 60)
877 Mr 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NO S 0 [0, 60)
878 Mr 879 0 3 Laleff, Mr. Kristo male 30.0 0 0 349217 7.8958 NO S 0 [0, 60)
879 Mrs 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 1 [60, 120)
880 Mrs 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NO S 1 [0, 60)
881 Mr 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NO S 0 [0, 60)
882 Miss 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NO S 0 [0, 60)
883 Mr 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NO S 0 [0, 60)
884 Mr 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NO S 0 [0, 60)
885 Mrs 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NO Q 1 [0, 60)
886 Rare 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NO S 0 [0, 60)
887 Miss 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 0 [0, 60)
888 Miss 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 21.0 1 2 W./C. 6607 23.4500 NO S 1 [0, 60)
889 Mr 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 0 [0, 60)
890 Mr 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NO Q 0 [0, 60)

891 rows × 15 columns

In [62]:
train.Age.isnull().sum()
Out[62]:
0
In [64]:
train.Age.isnull().any()
Out[64]:
False
In [65]:
train.Age.describe()
Out[65]:
count    891.000000
mean      29.392447
std       13.268389
min        0.420000
25%       21.000000
50%       30.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64
In [66]:
# Contingency table of Embarked (after filling missing values) by Survived.
Embarked_Survived = pd.crosstab(train['Embarked'],train['Survived'])
In [68]:
# Grouped bar chart of survival counts per port of embarkation.
Embarked_Survived.plot.bar()
plt.xticks(rotation=360)
plt.title('Survived status by Embarked')
plt.show()
 
In [69]:
train
Out[69]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0 Mr 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NO S 0 [0, 60)
1 Mrs 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0 [60, 120)
2 Miss 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NO S 0 [0, 60)
3 Mrs 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 1 [0, 60)
4 Mr 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NO S 0 [0, 60)
5 Mr 6 0 3 Moran, Mr. James male 30.0 0 0 330877 8.4583 NO Q 0 [0, 60)
6 Mr 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 0 [0, 60)
7 Master 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NO S 1 [0, 60)
8 Mrs 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NO S 1 [0, 60)
9 Mrs 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NO C 1 [0, 60)
10 Miss 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S 1 [0, 60)
11 Miss 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S 0 [0, 60)
12 Mr 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NO S 0 [0, 60)
13 Mr 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NO S 1 [0, 60)
14 Miss 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NO S 0 [0, 60)
15 Mrs 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NO S 0 [0, 60)
16 Master 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NO Q 1 [0, 60)
17 Mr 18 1 2 Williams, Mr. Charles Eugene male 30.0 0 0 244373 13.0000 NO S 0 [0, 60)
18 Mrs 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NO S 0 [0, 60)
19 Mrs 20 1 3 Masselmani, Mrs. Fatima female 35.0 0 0 2649 7.2250 NO C 0 [0, 60)
20 Mr 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NO S 1 [0, 60)
21 Mr 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S 0 [0, 60)
22 Miss 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NO Q 0 [0, 60)
23 Mr 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S 0 [0, 60)
24 Miss 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NO S 1 [0, 60)
25 Mrs 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NO S 1 [0, 60)
26 Mr 27 0 3 Emir, Mr. Farred Chehab male 30.0 0 0 2631 7.2250 NO C 0 [0, 60)
27 Mr 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S 1 [240, 300)
28 Miss 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female 21.0 0 0 330959 7.8792 NO Q 0 [0, 60)
29 Mr 30 0 3 Todoroff, Mr. Lalio male 30.0 0 0 349216 7.8958 NO S 0 [0, 60)
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 Mr 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 NO S 0 [0, 60)
862 Mrs 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S 0 [0, 60)
863 Miss 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female 21.0 8 2 CA. 2343 69.5500 NO S 1 [60, 120)
864 Mr 865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 NO S 0 [0, 60)
865 Mrs 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 NO S 0 [0, 60)
866 Miss 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 NO C 0 [0, 60)
867 Mr 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S 0 [0, 60)
868 Mr 869 0 3 van Melkebeke, Mr. Philemon male 30.0 0 0 345777 9.5000 NO S 0 [0, 60)
869 Master 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NO S 1 [0, 60)
870 Mr 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 NO S 0 [0, 60)
871 Mrs 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S 1 [0, 60)
872 Mr 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S 0 [0, 60)
873 Mr 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 NO S 0 [0, 60)
874 Mrs 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 NO C 1 [0, 60)
875 Miss 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 NO C 0 [0, 60)
876 Mr 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NO S 1 [0, 60)
877 Mr 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NO S 0 [0, 60)
878 Mr 879 0 3 Laleff, Mr. Kristo male 30.0 0 0 349217 7.8958 NO S 0 [0, 60)
879 Mrs 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 1 [60, 120)
880 Mrs 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NO S 1 [0, 60)
881 Mr 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NO S 0 [0, 60)
882 Miss 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NO S 0 [0, 60)
883 Mr 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NO S 0 [0, 60)
884 Mr 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NO S 0 [0, 60)
885 Mrs 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NO Q 1 [0, 60)
886 Rare 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NO S 0 [0, 60)
887 Miss 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 0 [0, 60)
888 Miss 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 21.0 1 2 W./C. 6607 23.4500 NO S 1 [0, 60)
889 Mr 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 0 [0, 60)
890 Mr 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NO Q 0 [0, 60)

891 rows × 15 columns

In [80]:
# GroupCabin is 0 when Cabin holds the 'NO' placeholder, 1 otherwise.
train['GroupCabin'] = np.where(train['Cabin'] == 'NO',0,1)
In [82]:
# Survival counts for passengers without (0) and with (1) a recorded cabin.
GroupCabin_Survived = pd.crosstab(train['GroupCabin'], train['Survived'])
GroupCabin_Survived.plot.bar()
plt.title('Survived status by GroupCabin')
plt.xticks(rotation=360)
plt.show()
 
In [86]:
# Group Age into 10 bins: 2**10 > 891 suggests 10 groups, and the bin width is
# (max 80 - min 0) / 10 = 8, rounded up to 9.
bins = list(range(0, 91, 9))
train['GroupAge'] = pd.cut(train.Age, bins)
GroupAge_Survived = pd.crosstab(train['GroupAge'], train['Survived'])
GroupAge_Survived.plot.bar()
plt.title('Survived status by GroupAge')
plt.show()
 
In [87]:
# Encode the five title groups as integers 0-4.
train['Appellation'] = train.Appellation.map({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rare': 4})
train.Appellation.unique()
Out[87]:
array([0, 1, 2, 3, 4], dtype=int64)
In [89]:
# Encode Sex as an integer: female -> 0, male -> 1.
train['Sex'] = train.Sex.map({'female':0,'male':1})
In [90]:
train.head()
Out[90]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
0 0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NO S 0 [0, 60) 0 (18, 27]
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C 0 [60, 120) 1 (36, 45]
2 2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NO S 0 [0, 60) 0 (18, 27]
3 1 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 S 1 [0, 60) 1 (27, 36]
4 0 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 NO S 0 [0, 60) 0 (27, 36]
In [95]:
# Replace Age in place with a bin code: ages below 9 become 0, and each
# following 9-year-wide range [9, 18), [18, 27), ..., [81, 90) becomes 1..9.
# The bins are applied in ascending order; codes already written are all
# below 9, so later (>= 9) conditions never match them again.
train.loc[train['Age'] < 9, 'Age'] = 0
for age_code, lower in enumerate(range(9, 90, 9), start=1):
    in_bin = (train['Age'] >= lower) & (train['Age'] < lower + 9)
    train.loc[in_bin, 'Age'] = age_code
train.Age.unique()
Out[95]:
array([ 2.,  4.,  3.,  6.,  0.,  1.,  7.,  5.,  8.])
In [96]:
train.head()
Out[96]:
 AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
0 0 1 0 3 Braund, Mr. Owen Harris 1 2.0 1 0 A/5 21171 7.2500 NO S 0 [0, 60) 0 (18, 27]
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 4.0 1 0 PC 17599 71.2833 C85 C 0 [60, 120) 1 (36, 45]
2 2 3 1 3 Heikkinen, Miss. Laina 0 2.0 0 0 STON/O2. 3101282 7.9250 NO S 0 [0, 60) 0 (18, 27]
3 1 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 3.0 1 0 113803 53.1000 C123 S 1 [0, 60) 1 (27, 36]
4 0 5 0 3 Allen, Mr. William Henry 1 3.0 0 0 373450 8.0500 NO S 0 [0, 60) 0 (27, 36]
In [97]:
# FamilySize counts the passenger plus siblings/spouses plus parents/children;
# when SibSp and Parch are both 0 the passenger is travelling alone (size 1).
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train.FamilySize.unique()
Out[97]:
array([ 2,  1,  5,  3,  7,  6,  4,  8, 11], dtype=int64)
In [98]:
# Replace Fare in place with a bin code: fares below 60 become 0, and each
# following 60-wide range [60, 120), [120, 180), ..., [540, 600) becomes 1..9.
# The bins are applied in ascending order; codes already written are all
# below 60, so later (>= 60) conditions never match them again.
train.loc[train['Fare'] < 60, 'Fare'] = 0
for fare_code, lower in enumerate(range(60, 600, 60), start=1):
    in_bin = (train['Fare'] >= lower) & (train['Fare'] < lower + 60)
    train.loc[in_bin, 'Fare'] = fare_code
train.Fare.unique()
Out[98]:
array([ 0.,  1.,  4.,  2.,  8.,  3.])
In [99]:
# Encode the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
train['Embarked'] = train.Embarked.map({'S': 0, 'C': 1, 'Q': 2})
In [100]:
# Drop the columns that are not used as model features below.
train.drop(['PassengerId', 'Name', 'GroupAge', 'SibSp', 'Parch', 'Ticket', 'GroupFare', 'Cabin'], axis = 1, inplace =True)
In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Feature matrix and target.
X=train[['Pclass', 'Appellation', 'Sex', 'Age', 'FamilySize', 'GroupTicket', 'Fare', 'GroupCabin', 'Embarked']]
y=train['Survived']
# Randomly split into a training set and a test set (20% held out).
# No random_state is given, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Initialise the logistic regression model.
lg = LogisticRegression()
# Fit the model on the training split.
lg.fit(X_train, y_train)
# Score the model on the held-out test split.
lg.score(X_test, y_test)
Out[110]:
0.78212290502793291
In [111]:
from sklearn.tree import DecisionTreeClassifier
# Maximum tree depth 15, at least 2 samples to split an internal node, at least
# 1 sample per leaf, at most 10 leaf nodes, at most 6 features per split.
dt = DecisionTreeClassifier(max_depth=15, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=10, max_features=6)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
Out[111]:
0.79329608938547491
In [126]:
# Support vector machine (SVM)
# sklearn.cross_validation has been removed from scikit-learn; the same helpers
# live in sklearn.model_selection, which this file already uses for
# train_test_split. SVC was used without ever being imported.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from scipy.stats import sem


def evaluate_cross_validation(clf, X, y, K):
    """Run K-fold cross-validation of ``clf`` and print the scores.

    Args:
        clf: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        K: Number of folds.

    Prints the per-fold scores, then their mean and standard error.
    Returns nothing.
    """
    # Shuffle before splitting; the fixed seed keeps the folds reproducible.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # For a classifier, cross_val_score uses the estimator's default score
    # unless another `scoring` is requested.
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print ('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))


# SVC with the RBF (Gaussian) kernel -- despite the variable's name, this is
# not a linear kernel. Kernel options: 'linear', 'poly' (polynomial),
# 'rbf' (radial basis function / Gaussian), 'sigmoid', 'precomputed'
# (a precomputed kernel matrix).
svc_linear = SVC(kernel='rbf')
# 5-fold cross-validation, K = 5
evaluate_cross_validation(svc_linear, X_train, y_train, 5)
 
[ 0.82517483  0.86013986  0.80985915  0.83802817  0.87323944]
Mean score: 0.841 (+/-0.011)
In [118]:
# Linear classifier
from sklearn.linear_model import SGDClassifier
# Use the SGD classifier, which suits large data sets; parameters are
# estimated by stochastic gradient descent.
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Import the evaluation module.
from sklearn import metrics
y_train_predict = clf.predict(X_train)
# In-sample check: accuracy on the training data.
print(metrics.accuracy_score(y_train, y_train_predict))
# Out-of-sample check: accuracy on the held-out test data.
y_predict = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_predict))
 
0.651685393258
0.659217877095
In [123]:
# Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB 
clf = GaussianNB()
clf.fit(X_train, y_train)
y_predict =clf.predict(X_test)
from sklearn.metrics import accuracy_score 
# Accuracy on the held-out test data.
print(accuracy_score(y_test, y_predict))
 
0.765363128492
 
posted @ 2018-11-26 10:05  USTC丶ZCC  阅读(520)  评论(0编辑  收藏  举报