第二章:数据清洗与可视化

# 1.数据清洗
import numpy as np
import pandas as pd
# Load the Titanic training data from train.csv in the working directory and display it.
df=pd.read_csv('train.csv')
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df.info()  # Show basic information about train.csv: columns, non-null counts, dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.isnull().sum()  # Count the missing values in each column of train.csv
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
df.isnull()  # Boolean table: True wherever a value is missing
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 False False False False False False False False False False True False
1 False False False False False False False False False False False False
2 False False False False False False False False False False True False
3 False False False False False False False False False False False False
4 False False False False False False False False False False True False
... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False True False
887 False False False False False False False False False False False False
888 False False False False False True False False False False True False
889 False False False False False False False False False False False False
890 False False False False False False False False False False True False

891 rows × 12 columns

df[['Age','Cabin','Embarked']]  # View the columns that contain missing data
Age Cabin Embarked
0 22.0 NaN S
1 38.0 C85 C
2 26.0 NaN S
3 35.0 C123 S
4 35.0 NaN S
... ... ... ...
886 27.0 NaN S
887 19.0 B42 S
888 NaN NaN S
889 26.0 C148 C
890 32.0 NaN Q

891 rows × 3 columns

# Fill the missing Age values with 0. fillna returns a copy, so df itself is unchanged.
df1=df.fillna({'Age':0})
df1
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df.loc[df['Age'].isnull()]  # Select the rows whose Age is missing
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
... ... ... ... ... ... ... ... ... ... ... ... ...
859 860 0 3 Razi, Mr. Raihed male NaN 0 0 2629 7.2292 NaN C
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.5500 NaN S
868 869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 345777 9.5000 NaN S
878 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S

177 rows × 12 columns

# Set Age to 0 directly in df for every row where it is missing.
df.loc[df['Age'].isnull(),'Age']=0
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Handle missing values across the whole table by replacing every one with 0.
# NOTE(review): this also puts the integer 0 into the text columns Cabin and
# Embarked, so those columns hold a mix of strings and ints from here on.
df=df.fillna(0)
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 147 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 81 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 147 2
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 55 2
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 147 2
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 147 2
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 30 2
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 147 2
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 60 0
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 147 1

891 rows × 12 columns

df.duplicated()  # Check for duplicated rows: True marks a row that repeats an earlier one
df[df.duplicated()]  # Pull out the duplicated rows
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# Small demo table in which rows 0 and 1 are identical.
a = pd.DataFrame(
    data=[
        ['a', 'a', '1'],
        ['a', 'a', '1'],
        ['c', 'c', '2'],
        ['c', 'c', '3'],
    ],
    columns=['A', 'B', 'C'],
)
a
A B C
0 a a 1
1 a a 1
2 c c 2
3 c c 3
a.drop_duplicates()  # Drop the duplicated rows; the result is only displayed, a is not reassigned
A B C
0 a a 1
2 c c 2
3 c c 3
df.drop_duplicates()  # Same on the Titanic table; the result is only displayed, df is not reassigned
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Save the cleaned table.
# NOTE(review): index=False is not passed, so the row index is written as an extra first column.
df.to_csv('train_clear.csv')
# 特征大概分为两大类:
# 数值型特征:Survived, Pclass, Age, SibSp, Parch, Fare,其中Survived, Pclass, SibSp, Parch为离散型数值特征,Age, Fare为连续型数值特征。
# 文本型特征:Name, Sex, Cabin, Embarked, Ticket,其中Sex, Cabin, Embarked, Ticket为类别型文本特征。
# 数值型特征一般可以直接用于模型的训练,但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。文本型特征往往需要转换成数值型特征才能用于建模分析。
# Split Age into 5 equal-width bins, label them '1'..'5', and store the
# result in df under the column name 'Age bins'.
df['Age bins'] = pd.cut(df['Age'], bins=5, labels=['1', '2', '3', '4', '5'])
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age bins
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 0 S 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 3
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 0 S 2
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 3
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 0 S 3
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 0 S 2
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 2
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 0 S 1
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 2
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 0 Q 2

891 rows × 13 columns

from matplotlib import pyplot as plt
plt.hist(df['Age'])  # Histogram of Age
(array([227.,  33., 164., 181., 123.,  74.,  50.,  26.,  11.,   2.]),
 array([ 0.,  8., 16., 24., 32., 40., 48., 56., 64., 72., 80.]),
 <BarContainer object of 10 artists>)

img

plt.hist(df['Age bins'])  # Histogram of the Age bins labels
(array([346.,   0., 188.,   0.,   0., 277.,   0.,  69.,   0.,  11.]),
 array([0. , 0.4, 0.8, 1.2, 1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. ]),
 <BarContainer object of 10 artists>)

img

# Bin Age with explicit edges. right=False makes every interval closed on the
# left and open on the right (the default, right=True, gives left-open,
# right-closed intervals).
# Bins: [0,5), [5,15), [15,30), [30,50), [50,81).
# The last edge is 81 rather than 80: with a half-open last bin of [50,80) an
# age of exactly 80 (the maximum in this data) would fall outside every bin
# and become NaN.
df['Age bins']=pd.cut(df['Age'],[0,5,15,30,50,81],right = False,labels=list('12345'))
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age bins
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 0 S 3
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 4
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 0 S 3
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 0 S 4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 0 S 3
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 3
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 0 S 1
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 3
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 0 Q 4

891 rows × 13 columns

# Bin Age by share of the data (quantiles) instead of by value.
# The quantile list ends at 1 so the top 10% of ages also get a bin; when it
# stopped at 0.9, every age above the 90th percentile became NaN.
# duplicates='drop' removes repeated bin edges (the default, 'raise', errors
# out instead). Here the 0 and 0.1 quantiles are both 0 because the missing
# ages were filled with 0, so one edge is dropped and the six quantile
# intervals collapse into five bins, hence five labels.
df['Age bins']=pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,0.9,1],duplicates='drop',labels=list('12345'))
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age bins
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 0 S 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 4
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 0 S 3
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 0 S 4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 0 S 3
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 2
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 0 S 1
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 3
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 0 Q 3

891 rows × 13 columns

# Save the binned table.
# NOTE(review): index=False is not passed, so the row index is written as an extra first column.
df.to_csv('train_bin.csv')
df['Sex'].unique()  # List the distinct values of the text variable
array(['male', 'female'], dtype=object)
df['Cabin'].unique()  # Distinct Cabin values
array([0, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27',
       'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31',
       'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47',
       'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4',
       'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77',
       'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D',
       'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7',
       'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37',
       'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104',
       'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30',
       'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22',
       'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20',
       'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126',
       'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64',
       'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30', 'E121',
       'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36', 'B102',
       'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42', 'C148'],
      dtype=object)
df['Embarked'].unique()  # Distinct Embarked values
array(['S', 'C', 'Q', 0], dtype=object)
# Encode Sex numerically: 'male' -> 1, 'female' -> 2.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['Sex']: the in-place call operates on an
# intermediate object and, under pandas copy-on-write, leaves df unchanged.
df['Sex'] = df['Sex'].replace(['male', 'female'], [1, 2])
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age bins
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 0 S 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 38.0 1 0 PC 17599 71.2833 C85 C 4
2 3 1 3 Heikkinen, Miss. Laina 2 26.0 0 0 STON/O2. 3101282 7.9250 0 S 3
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 2 35.0 1 0 113803 53.1000 C123 S 4
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 0 S 4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas 1 27.0 0 0 211536 13.0000 0 S 3
887 888 1 1 Graham, Miss. Margaret Edith 2 19.0 0 0 112053 30.0000 B42 S 2
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" 2 0.0 1 2 W./C. 6607 23.4500 0 S 1
889 890 1 1 Behr, Mr. Karl Howell 1 26.0 0 0 111369 30.0000 C148 C 3
890 891 0 3 Dooley, Mr. Patrick 1 32.0 0 0 370376 7.7500 0 Q 3

891 rows × 13 columns

from sklearn.preprocessing import LabelEncoder
# Convert the Cabin text labels to integer codes with scikit-learn's LabelEncoder.
# Cabin holds both strings and the integer 0 left by the earlier fillna(0);
# LabelEncoder rejects mixed str/int input, so cast every value to str first.
df['Cabin'] = LabelEncoder().fit_transform(df['Cabin'].astype(str))
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 147 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 81 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 147 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 55 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 147 S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 147 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 30 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 147 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 60 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 147 Q

891 rows × 12 columns

# Convert the Embarked text labels to integer codes with scikit-learn's LabelEncoder.
# Embarked holds both strings and the integer 0 left by the earlier fillna(0);
# LabelEncoder rejects mixed str/int input, so cast every value to str first.
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'].astype(str))
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 147 2
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 81 0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 147 2
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 55 2
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 147 2
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 147 2
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 30 2
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 147 2
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 60 0
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 147 1

891 rows × 12 columns

# One-hot encode Cabin, Age and Embarked, then append the indicator columns
# to df column-wise in that order.
dummy_frames = [
    pd.get_dummies(df[name], prefix=name)
    for name in ('Cabin', 'Age', 'Embarked')
]
df = pd.concat([df, *dummy_frames], axis=1)
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare ... Age_66.0 Age_70.0 Age_70.5 Age_71.0 Age_74.0 Age_80.0 Embarked_0 Embarked_1 Embarked_2 Embarked_3
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 ... False False False False False False False False True False
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 ... False False False False False False True False False False
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 ... False False False False False False False False True False
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 ... False False False False False False False False True False
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 ... False False False False False False False False True False

5 rows × 253 columns

# Extract the title (the word immediately before a '.') from each Name with a
# regular expression and store it in the 'Title' column.
# expand=False makes str.extract return a Series for the single capture group
# instead of a one-column DataFrame, which is the natural type for a column.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare ... Age_70.0 Age_70.5 Age_71.0 Age_74.0 Age_80.0 Embarked_0 Embarked_1 Embarked_2 Embarked_3 Title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 ... False False False False False False False True False Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 ... False False False False False True False False False Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 ... False False False False False False False True False Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 ... False False False False False False False True False Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 ... False False False False False False False True False Mr
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 ... False False False False False False False True False Rev
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 ... False False False False False False False True False Miss
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 ... False False False False False False False True False Miss
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 ... False False False False False True False False False Mr
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 ... False False False False False False True False False Mr

891 rows × 254 columns

#2.数据重构


# Load the top-left piece of the split training data (PassengerId..Name columns).
left_up=pd.read_csv('data/train-left-up.csv')
left_up
PassengerId Survived Pclass Name
0 1 0 3 Braund, Mr. Owen Harris
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 3 1 3 Heikkinen, Miss. Laina
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 5 0 3 Allen, Mr. William Henry
... ... ... ... ...
434 435 0 1 Silvey, Mr. William Baird
435 436 1 1 Carter, Miss. Lucile Polk
436 437 0 3 Ford, Miss. Doolina Margaret "Daisy"
437 438 1 2 Richards, Mrs. Sidney (Emily Hocking)
438 439 0 1 Fortune, Mr. Mark

439 rows × 4 columns

# Load the bottom-left piece of the split training data (PassengerId..Name columns).
left_down=pd.read_csv('data/train-left-down.csv')
left_down
PassengerId Survived Pclass Name
0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson
1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield)
2 442 0 3 Hampe, Mr. Leon
3 443 0 3 Petterson, Mr. Johan Emil
4 444 1 2 Reynaldo, Ms. Encarnacion
... ... ... ... ...
447 887 0 2 Montvila, Rev. Juozas
448 888 1 1 Graham, Miss. Margaret Edith
449 889 0 3 Johnston, Miss. Catherine Helen "Carrie"
450 890 1 1 Behr, Mr. Karl Howell
451 891 0 3 Dooley, Mr. Patrick

452 rows × 4 columns

# Load the top-right piece of the split training data (Sex..Embarked columns).
right_up=pd.read_csv('data/train-right-up.csv')
right_up
Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 male 22.0 1 0 A/5 21171 7.2500 NaN S
1 female 38.0 1 0 PC 17599 71.2833 C85 C
2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 female 35.0 1 0 113803 53.1000 C123 S
4 male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ...
434 male 50.0 1 0 13507 55.9000 E44 S
435 female 14.0 1 2 113760 120.0000 B96 B98 S
436 female 21.0 2 2 W./C. 6608 34.3750 NaN S
437 female 24.0 2 3 29106 18.7500 NaN S
438 male 64.0 1 4 19950 263.0000 C23 C25 C27 S

439 rows × 8 columns

# Load the bottom-right piece of the split training data (Sex..Embarked columns).
right_down=pd.read_csv('data/train-right-down.csv')
right_down
Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 male 31.0 0 0 C.A. 18723 10.500 NaN S
1 female 45.0 1 1 F.C.C. 13529 26.250 NaN S
2 male 20.0 0 0 345769 9.500 NaN S
3 male 25.0 1 0 347076 7.775 NaN S
4 female 28.0 0 0 230434 13.000 NaN S
... ... ... ... ... ... ... ... ...
447 male 27.0 0 0 211536 13.000 NaN S
448 female 19.0 0 0 112053 30.000 B42 S
449 female NaN 1 2 W./C. 6607 23.450 NaN S
450 male 26.0 0 0 111369 30.000 C148 C
451 male 32.0 0 0 370376 7.750 NaN Q

452 rows × 8 columns

# Stitch the pieces together with concat.
# Place left_up and right_up side by side (column-wise) to form one table.
result_up = pd.concat(objs=[left_up, right_up], axis='columns')
result_up
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
434 435 0 1 Silvey, Mr. William Baird male 50.0 1 0 13507 55.9000 E44 S
435 436 1 1 Carter, Miss. Lucile Polk female 14.0 1 2 113760 120.0000 B96 B98 S
436 437 0 3 Ford, Miss. Doolina Margaret "Daisy" female 21.0 2 2 W./C. 6608 34.3750 NaN S
437 438 1 2 Richards, Mrs. Sidney (Emily Hocking) female 24.0 2 3 29106 18.7500 NaN S
438 439 0 1 Fortune, Mr. Mark male 64.0 1 4 19950 263.0000 C23 C25 C27 S

439 rows × 12 columns

# Place left_down and right_down side by side (column-wise) to form one table.
result_down = pd.concat(objs=[left_down, right_down], axis='columns')
result_down
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson male 31.0 0 0 C.A. 18723 10.500 NaN S
1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.250 NaN S
2 442 0 3 Hampe, Mr. Leon male 20.0 0 0 345769 9.500 NaN S
3 443 0 3 Petterson, Mr. Johan Emil male 25.0 1 0 347076 7.775 NaN S
4 444 1 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.000 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
447 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.000 NaN S
448 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.000 B42 S
449 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.450 NaN S
450 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.000 C148 C
451 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.750 NaN Q

452 rows × 12 columns

# Stack result_up on top of result_down (row-wise, the default axis=0) and
# renumber the rows 0..n-1 so the index does not restart in the lower half.
result = pd.concat([result_up, result_down], ignore_index=True)
result
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Stitch the pieces together with join.
# join aligns the two tables on their row index and puts them side by side.
up = left_up.join(right_up, how='left')
up
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
434 435 0 1 Silvey, Mr. William Baird male 50.0 1 0 13507 55.9000 E44 S
435 436 1 1 Carter, Miss. Lucile Polk female 14.0 1 2 113760 120.0000 B96 B98 S
436 437 0 3 Ford, Miss. Doolina Margaret "Daisy" female 21.0 2 2 W./C. 6608 34.3750 NaN S
437 438 1 2 Richards, Mrs. Sidney (Emily Hocking) female 24.0 2 3 29106 18.7500 NaN S
438 439 0 1 Fortune, Mr. Mark male 64.0 1 4 19950 263.0000 C23 C25 C27 S

439 rows × 12 columns

# Same for the lower half: align on the row index and put the tables side by side.
down = left_down.join(other=right_down)
down
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson male 31.0 0 0 C.A. 18723 10.500 NaN S
1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.250 NaN S
2 442 0 3 Hampe, Mr. Leon male 20.0 0 0 345769 9.500 NaN S
3 443 0 3 Petterson, Mr. Johan Emil male 25.0 1 0 347076 7.775 NaN S
4 444 1 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.000 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
447 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.000 NaN S
448 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.000 B42 S
449 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.450 NaN S
450 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.000 C148 C
451 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.750 NaN Q

452 rows × 12 columns

# join handled the side-by-side step; concat stacks the two halves vertically.
# The row index is then renumbered 0..n-1.
res = pd.concat([up, down], axis=0).reset_index(drop=True)
res
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Stitch the pieces together with merge.
# left_index=True and right_index=True make the merge match rows on the row index.
up = left_up.merge(right_up, left_index=True, right_index=True)
up
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
434 435 0 1 Silvey, Mr. William Baird male 50.0 1 0 13507 55.9000 E44 S
435 436 1 1 Carter, Miss. Lucile Polk female 14.0 1 2 113760 120.0000 B96 B98 S
436 437 0 3 Ford, Miss. Doolina Margaret "Daisy" female 21.0 2 2 W./C. 6608 34.3750 NaN S
437 438 1 2 Richards, Mrs. Sidney (Emily Hocking) female 24.0 2 3 29106 18.7500 NaN S
438 439 0 1 Fortune, Mr. Mark male 64.0 1 4 19950 263.0000 C23 C25 C27 S

439 rows × 12 columns

# left_index=True and right_index=True make the merge match rows on the row index.
down = left_down.merge(right=right_down, left_index=True, right_index=True)
down
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson male 31.0 0 0 C.A. 18723 10.500 NaN S
1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.250 NaN S
2 442 0 3 Hampe, Mr. Leon male 20.0 0 0 345769 9.500 NaN S
3 443 0 3 Petterson, Mr. Johan Emil male 25.0 1 0 347076 7.775 NaN S
4 444 1 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.000 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
447 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.000 NaN S
448 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.000 B42 S
449 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.450 NaN S
450 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.000 C148 C
451 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.750 NaN Q

452 rows × 12 columns

# merge handled the side-by-side step; concat stacks the two halves vertically
# and renumbers the rows 0..n-1.
res1 = pd.concat([up, down], ignore_index=True)
res1
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Save the reassembled table and read it back.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an unnamed first column and comes back as 'Unnamed: 0'.
result.to_csv('result.csv')
data=pd.read_csv('result.csv')
data
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 13 columns

data_unit=data.stack()#将数据改为Series类型数据
data_unit
0    Unnamed: 0                           0
     PassengerId                          1
     Survived                             0
     Pclass                               3
     Name           Braund, Mr. Owen Harris
                             ...           
890  SibSp                                0
     Parch                                0
     Ticket                          370376
     Fare                              7.75
     Embarked                             Q
Length: 10717, dtype: object
# 数据重构第二部分
df=pd.read_csv('result.csv')
df.head(2)
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
list(df.groupby('Sex'))#将男性和女性分成两组
[('female',
       Unnamed: 0  PassengerId  Survived  Pclass  \
  1             1            2         1       1   
  2             2            3         1       3   
  3             3            4         1       1   
  8             8            9         1       3   
  9             9           10         1       2   
  ..          ...          ...       ...     ...   
  880         880          881         1       2   
  882         882          883         0       3   
  885         885          886         0       3   
  887         887          888         1       1   
  888         888          889         0       3   
  
                                                    Name     Sex   Age  SibSp  \
  1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
  2                               Heikkinen, Miss. Laina  female  26.0      0   
  3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
  8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
  9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
  ..                                                 ...     ...   ...    ...   
  880       Shelley, Mrs. William (Imanita Parrish Hall)  female  25.0      0   
  882                       Dahlberg, Miss. Gerda Ulrika  female  22.0      0   
  885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
  887                       Graham, Miss. Margaret Edith  female  19.0      0   
  888           Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1   
  
       Parch            Ticket     Fare Cabin Embarked  
  1        0          PC 17599  71.2833   C85        C  
  2        0  STON/O2. 3101282   7.9250   NaN        S  
  3        0            113803  53.1000  C123        S  
  8        2            347742  11.1333   NaN        S  
  9        0            237736  30.0708   NaN        C  
  ..     ...               ...      ...   ...      ...  
  880      1            230433  26.0000   NaN        S  
  882      0              7552  10.5167   NaN        S  
  885      5            382652  29.1250   NaN        Q  
  887      0            112053  30.0000   B42        S  
  888      2        W./C. 6607  23.4500   NaN        S  
  
  [314 rows x 13 columns]),
 ('male',
       Unnamed: 0  PassengerId  Survived  Pclass  \
  0             0            1         0       3   
  4             4            5         0       3   
  5             5            6         0       3   
  6             6            7         0       1   
  7             7            8         0       3   
  ..          ...          ...       ...     ...   
  883         883          884         0       2   
  884         884          885         0       3   
  886         886          887         0       2   
  889         889          890         1       1   
  890         890          891         0       3   
  
                                 Name   Sex   Age  SibSp  Parch  \
  0           Braund, Mr. Owen Harris  male  22.0      1      0   
  4          Allen, Mr. William Henry  male  35.0      0      0   
  5                  Moran, Mr. James  male   NaN      0      0   
  6           McCarthy, Mr. Timothy J  male  54.0      0      0   
  7    Palsson, Master. Gosta Leonard  male   2.0      3      1   
  ..                              ...   ...   ...    ...    ...   
  883   Banfield, Mr. Frederick James  male  28.0      0      0   
  884          Sutehall, Mr. Henry Jr  male  25.0      0      0   
  886           Montvila, Rev. Juozas  male  27.0      0      0   
  889           Behr, Mr. Karl Howell  male  26.0      0      0   
  890             Dooley, Mr. Patrick  male  32.0      0      0   
  
                 Ticket     Fare Cabin Embarked  
  0           A/5 21171   7.2500   NaN        S  
  4              373450   8.0500   NaN        S  
  5              330877   8.4583   NaN        Q  
  6               17463  51.8625   E46        S  
  7              349909  21.0750   NaN        S  
  ..                ...      ...   ...      ...  
  883  C.A./SOTON 34068  10.5000   NaN        S  
  884   SOTON/OQ 392076   7.0500   NaN        S  
  886            211536  13.0000   NaN        S  
  889            111369  30.0000  C148        C  
  890            370376   7.7500   NaN        Q  
  
  [577 rows x 13 columns])]
df.groupby('Sex').describe()#查看女性和男性的不同信息
Unnamed: 0 PassengerId ... Parch Fare
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
Sex
female 314.0 430.028662 256.846324 1.0 230.75 413.5 640.25 888.0 314.0 431.028662 ... 1.0 6.0 314.0 44.479818 57.997698 6.75 12.071875 23.0 55.00 512.3292
male 577.0 453.147314 257.486139 0.0 221.00 463.0 679.00 890.0 577.0 454.147314 ... 0.0 5.0 577.0 25.523893 43.138263 0.00 7.895800 10.5 26.55 512.3292

2 rows × 64 columns

df.groupby('Sex')['Age'].describe()#只查看年龄的相关信息
count mean std min 25% 50% 75% max
Sex
female 261.0 27.915709 14.110146 0.75 18.0 27.0 37.0 63.0
male 453.0 30.726645 14.678201 0.42 21.0 29.0 39.0 80.0
df.groupby('Sex')['Age'].mean()#只查看年龄的平均值信息
Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64
mean_fare_sex=df.groupby('Sex')['Fare'].describe()#票价信息
mean_fare_sex
count mean std min 25% 50% 75% max
Sex
female 314.0 44.479818 57.997698 6.75 12.071875 23.0 55.00 512.3292
male 577.0 25.523893 43.138263 0.00 7.895800 10.5 26.55 512.3292
survived_sex=df.groupby('Sex')['Survived'].sum()#存活总人数
survived_sex
Sex
female    233
male      109
Name: Survived, dtype: int64
survived_Pclass=df.groupby('Pclass')['Survived'].sum()#1,2,3等船舱存活总人数
survived_Pclass
Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64
# 用agg方法进行上述任务
df.groupby('Sex').agg({'Survived':'sum','Fare':'mean'}).rename(columns={'Survived':'Survived_sum','Fare':'Fare_mean'})
#通过agg方法同时完成两个聚合:对'Survived'求和,对'Fare'求平均值
Survived_sum Fare_mean
Sex
female 233 44.479818
male 109 25.523893
df.groupby(['Pclass','Age'])['Fare'].mean()#通过'Pclass','Age'来计算'Fare'的平均值
Pclass  Age  
1       0.92     151.5500
        2.00     151.5500
        4.00      81.8583
        11.00    120.0000
        14.00    120.0000
                   ...   
3       61.00      6.2375
        63.00      9.5875
        65.00      7.7500
        70.50      7.7500
        74.00      7.7750
Name: Fare, Length: 182, dtype: float64
mean_fare_sex.index
Index(['female', 'male'], dtype='object', name='Sex')
survived_sex=survived_sex.to_frame()#将Series数据变为DataFrame类型数据
type(survived_sex)
pandas.core.frame.DataFrame
pd.merge(survived_sex,mean_fare_sex,on='Sex')#将survived_sex,mean_fare_sex进行合并,并以'Sex'作为连接键
Survived count mean std min 25% 50% 75% max
Sex
female 233 314.0 44.479818 57.997698 6.75 12.071875 23.0 55.00 512.3292
male 109 577.0 25.523893 43.138263 0.00 7.895800 10.5 26.55 512.3292
survived_age=df.groupby(['Age'])['Survived'].sum()
survived_age
Age
0.42     1
0.67     1
0.75     2
0.83     2
0.92     1
        ..
70.00    0
70.50    0
71.00    0
74.00    0
80.00    1
Name: Survived, Length: 88, dtype: int64
max(survived_age)
15
survived_age[survived_age.values==max(survived_age)]#查找存活人数最多的年龄(注意:是存活人数最多,不是存活率最高)
Age
24.0    15
Name: Survived, dtype: int64
# Survival rate at the age that has the most survivors (this is not the highest
# survival rate across all ages). The age is taken from survived_age itself via
# idxmax() instead of hard-coding 24.0, so the cell stays correct if the data changes.
rate=survived_age.max()/(df['Age']==survived_age.idxmax()).sum()
rate
0.5
f'最大存活率:{rate}'
'最大存活率:0.5'
#数据可视化
df=pd.read_csv('result.csv')
df.head(2)
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
sex=df.groupby('Sex')['Survived'].sum()
sex
Sex
female    233
male      109
Name: Survived, dtype: int64
sex.plot.bar()#画出女性存活数和男性存活数的柱状图,plot是Series自带的绘图接口(底层调用matplotlib)
plt.title('survived')
Text(0.5, 1.0, 'survived')

img

df.groupby(['Sex','Survived'])['Survived'].count()
Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64
sex_survived=df.groupby(['Sex','Survived'])['Survived'].count().unstack()#分别计算男性和女性0和1(死亡和生存)的人数,用.unstack()进行反转
sex_survived
Survived 0 1
Sex
female 81 233
male 468 109
died=sex_survived[0]
died.plot.bar()
plt.title('died')
Text(0.5, 1.0, 'died')

img

sex_survived.plot.bar()
plt.title('survived and died')
Text(0.5, 1.0, 'survived and died')

img

# stacked=True draws the two Survived columns on top of each other in one bar
# per sex. Pass a real boolean: the string 'True' only appeared to work because
# any non-empty string is truthy (so 'False' would have stacked the bars too).
sex_survived.plot(kind='bar',stacked=True)
<Axes: xlabel='Sex'>

fare=df.groupby(['Fare','Survived'])['Survived'].count().unstack()#按票价分别统计0和1(死亡和生存)的人数,用.unstack()把Survived展开为列
fare
Survived 0 1
Fare
0.0000 14.0 1.0
4.0125 1.0 NaN
5.0000 1.0 NaN
6.2375 1.0 NaN
6.4375 1.0 NaN
... ... ...
227.5250 1.0 3.0
247.5208 1.0 1.0
262.3750 NaN 2.0
263.0000 2.0 2.0
512.3292 NaN 3.0

248 rows × 2 columns

fare.plot()#默认为折线图
<Axes: xlabel='Fare'>

pclass=df.groupby(['Pclass','Survived'])['Survived'].count().unstack()#按船舱等级分别统计0和1(死亡和生存)的人数,用.unstack()把Survived展开为列
pclass
Survived 0 1
Pclass
1 80 136
2 97 87
3 372 119
pclass.plot.bar()
<Axes: xlabel='Pclass'>

img

df.Age[df.Survived==0].hist(bins=5,alpha=0.5)#死亡乘客年龄的直方图
df.Age[df.Survived==1].hist(bins=5,alpha=0.5)#存活乘客年龄的直方图
# bins=5表示分成5份,alpha=0.5用来调整透明度
plt.legend([0,1])#蓝色为0,黄色为1
plt.xlabel('age')#横坐标标签
plt.ylabel('count')#纵坐标标签
Text(0, 0.5, 'count')

img

df.Age[df.Survived==0].hist(bins=5,alpha=0.5,density=1)#死亡乘客年龄的直方图
df.Age[df.Survived==1].hist(bins=5,alpha=0.5,density=1)#存活乘客年龄的直方图
# bins=5表示分成5份,alpha=0.5用来调整透明度,density=1表示把y轴改为密度
df.Age[df.Survived==0].plot.density()
df.Age[df.Survived==1].plot.density()#添加一个密度曲线
plt.legend([0,1])#蓝色为0,黄色为1
plt.xlabel('age')#横坐标标签
plt.ylabel('density')#纵坐标标签
Text(0, 0.5, 'density')

img

df.Age[df.Pclass==1].plot.density()#先画出1等舱乘客的年龄密度曲线(2,3等舱在下面的循环中一并画出)
<Axes: ylabel='Density'>

img

unique_pclass=df.Pclass.unique()
unique_pclass.sort()#顺序排序
unique_pclass
array([1, 2, 3], dtype=int64)
for i in unique_pclass:
    df.Age[df.Pclass==i].plot.density()
plt.xlabel('age')
plt.legend(unique_pclass)
<matplotlib.legend.Legend at 0x1e2ff3706d0>

img

import seaborn as sns#画图库
# Draw one age KDE per passenger class on the same axes.
for i in unique_pclass:
    # fill=True shades the area under the curve; it replaces the deprecated
    # shade=True keyword. linewidth=0 sets the outline width to zero.
    sns.kdeplot(df.Age[df.Pclass==i],fill=True,linewidth=0)

img


posted @   瑟兰迪尔·绿叶  阅读(76)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
点击右上角即可分享
微信分享提示