Python数据分析与机器学习-Pandas_3

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
print(titanic_survival.shape)
(891, 12)
# The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# We can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
print(age.loc[0:10])
age_is_null = pd.isnull(age)
print(age_is_null)
age_null_true = age[age_is_null]
print(age_null_true)
print(len(age_null_true))
0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29      True
       ...  
861    False
862    False
863     True
864    False
865    False
866    False
867    False
868     True
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878     True
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
29    NaN
31    NaN
32    NaN
36    NaN
42    NaN
45    NaN
46    NaN
47    NaN
48    NaN
55    NaN
64    NaN
65    NaN
76    NaN
77    NaN
82    NaN
87    NaN
95    NaN
101   NaN
107   NaN
109   NaN
121   NaN
126   NaN
128   NaN
140   NaN
154   NaN
       ..
718   NaN
727   NaN
732   NaN
738   NaN
739   NaN
740   NaN
760   NaN
766   NaN
768   NaN
773   NaN
776   NaN
778   NaN
783   NaN
790   NaN
792   NaN
793   NaN
815   NaN
825   NaN
826   NaN
828   NaN
832   NaN
837   NaN
839   NaN
846   NaN
849   NaN
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177
# The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"])/len(titanic_survival["Age"])
print(mean_age)
nan
# We have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null==False]
correct_mean_age = sum(good_ages)/len(good_ages)
print(correct_mean_age)
29.69911764705882
# Missing data is so common that many pandas methods automatically filter for it
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
29.69911764705882
# Mean fare for each class
passenger_classes = [1,2,3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# index tells the method which column to group by values is the column that we want to apply 
# the calculation to aggrunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass",values="Survived", aggfunc=np.mean)
print(passenger_survival)
        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
#specifying axis=1 or axis='columns' will drop any columns that have null values
print(titanic_survival.shape)
drop_na_columns = titanic_survival.dropna(axis=1)
print(drop_na_columns.shape)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
print(new_titanic_survival)
(891, 12)
(891, 9)
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
18            19         0       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25            26         1       3   
27            28         0       1   
30            31         0       1   
33            34         0       2   
34            35         0       1   
35            36         0       1   
37            38         0       3   
38            39         0       3   
..           ...       ...     ...   
856          857         1       1   
857          858         1       1   
858          859         1       3   
860          861         0       3   
861          862         0       2   
862          863         1       1   
864          865         0       2   
865          866         1       2   
866          867         1       2   
867          868         0       1   
869          870         1       3   
870          871         0       3   
871          872         1       1   
872          873         0       1   
873          874         0       3   
874          875         1       2   
875          876         1       3   
876          877         0       3   
877          878         0       3   
879          880         1       1   
880          881         1       2   
881          882         0       3   
882          883         0       3   
883          884         0       2   
884          885         0       3   
885          886         0       3   
886          887         0       2   
887          888         1       1   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
6                              McCarthy, Mr. Timothy J    male  54.0      0   
7                       Palsson, Master. Gosta Leonard    male   2.0      3   
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
10                     Sandstrom, Miss. Marguerite Rut  female   4.0      1   
11                            Bonnell, Miss. Elizabeth  female  58.0      0   
12                      Saundercock, Mr. William Henry    male  20.0      0   
13                         Andersson, Mr. Anders Johan    male  39.0      1   
14                Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   
15                    Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0   
16                                Rice, Master. Eugene    male   2.0      4   
18   Vander Planke, Mrs. Julius (Emelia Maria Vande...  female  31.0      1   
20                                Fynney, Mr. Joseph J    male  35.0      0   
21                               Beesley, Mr. Lawrence    male  34.0      0   
22                         McGowan, Miss. Anna "Annie"  female  15.0      0   
23                        Sloper, Mr. William Thompson    male  28.0      0   
24                       Palsson, Miss. Torborg Danira  female   8.0      3   
25   Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...  female  38.0      1   
27                      Fortune, Mr. Charles Alexander    male  19.0      3   
30                            Uruchurtu, Don. Manuel E    male  40.0      0   
33                               Wheadon, Mr. Edward H    male  66.0      0   
34                             Meyer, Mr. Edgar Joseph    male  28.0      1   
35                      Holverson, Mr. Alexander Oskar    male  42.0      1   
37                            Cann, Mr. Ernest Charles    male  21.0      0   
38                  Vander Planke, Miss. Augusta Maria  female  18.0      2   
..                                                 ...     ...   ...    ...   
856         Wick, Mrs. George Dennick (Mary Hitchcock)  female  45.0      1   
857                             Daly, Mr. Peter Denis     male  51.0      0   
858              Baclini, Mrs. Solomon (Latifa Qurban)  female  24.0      0   
860                            Hansen, Mr. Claus Peter    male  41.0      2   
861                        Giles, Mr. Frederick Edward    male  21.0      1   
862  Swift, Mrs. Frederick Joel (Margaret Welles Ba...  female  48.0      0   
864                             Gill, Mr. John William    male  24.0      0   
865                           Bystrom, Mrs. (Karolina)  female  42.0      0   
866                       Duran y More, Miss. Asuncion  female  27.0      1   
867               Roebling, Mr. Washington Augustus II    male  31.0      0   
869                    Johnson, Master. Harold Theodor    male   4.0      1   
870                                  Balkic, Mr. Cerin    male  26.0      0   
871   Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0      1   
872                           Carlsson, Mr. Frans Olof    male  33.0      0   
873                        Vander Cruyssen, Mr. Victor    male  47.0      0   
874              Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1   
875                   Najib, Miss. Adele Kiamie "Jane"  female  15.0      0   
876                      Gustafsson, Mr. Alfred Ossian    male  20.0      0   
877                               Petroff, Mr. Nedelio    male  19.0      0   
879      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)  female  56.0      0   
880       Shelley, Mrs. William (Imanita Parrish Hall)  female  25.0      0   
881                                 Markun, Mr. Johann    male  33.0      0   
882                       Dahlberg, Miss. Gerda Ulrika  female  22.0      0   
883                      Banfield, Mr. Frederick James    male  28.0      0   
884                             Sutehall, Mr. Henry Jr    male  25.0      0   
885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket      Fare        Cabin Embarked  
0        0         A/5 21171    7.2500          NaN        S  
1        0          PC 17599   71.2833          C85        C  
2        0  STON/O2. 3101282    7.9250          NaN        S  
3        0            113803   53.1000         C123        S  
4        0            373450    8.0500          NaN        S  
6        0             17463   51.8625          E46        S  
7        1            349909   21.0750          NaN        S  
8        2            347742   11.1333          NaN        S  
9        0            237736   30.0708          NaN        C  
10       1           PP 9549   16.7000           G6        S  
11       0            113783   26.5500         C103        S  
12       0         A/5. 2151    8.0500          NaN        S  
13       5            347082   31.2750          NaN        S  
14       0            350406    7.8542          NaN        S  
15       0            248706   16.0000          NaN        S  
16       1            382652   29.1250          NaN        Q  
18       0            345763   18.0000          NaN        S  
20       0            239865   26.0000          NaN        S  
21       0            248698   13.0000          D56        S  
22       0            330923    8.0292          NaN        Q  
23       0            113788   35.5000           A6        S  
24       1            349909   21.0750          NaN        S  
25       5            347077   31.3875          NaN        S  
27       2             19950  263.0000  C23 C25 C27        S  
30       0          PC 17601   27.7208          NaN        C  
33       0        C.A. 24579   10.5000          NaN        S  
34       0          PC 17604   82.1708          NaN        C  
35       0            113789   52.0000          NaN        S  
37       0        A./5. 2152    8.0500          NaN        S  
38       0            345764   18.0000          NaN        S  
..     ...               ...       ...          ...      ...  
856      1             36928  164.8667          NaN        S  
857      0            113055   26.5500          E17        S  
858      3              2666   19.2583          NaN        C  
860      0            350026   14.1083          NaN        S  
861      0             28134   11.5000          NaN        S  
862      0             17466   25.9292          D17        S  
864      0            233866   13.0000          NaN        S  
865      0            236852   13.0000          NaN        S  
866      0     SC/PARIS 2149   13.8583          NaN        C  
867      0          PC 17590   50.4958          A24        S  
869      1            347742   11.1333          NaN        S  
870      0            349248    7.8958          NaN        S  
871      1             11751   52.5542          D35        S  
872      0               695    5.0000  B51 B53 B55        S  
873      0            345765    9.0000          NaN        S  
874      0         P/PP 3381   24.0000          NaN        C  
875      0              2667    7.2250          NaN        C  
876      0              7534    9.8458          NaN        S  
877      0            349212    7.8958          NaN        S  
879      1             11767   83.1583          C50        C  
880      1            230433   26.0000          NaN        S  
881      0            349257    7.8958          NaN        S  
882      0              7552   10.5167          NaN        S  
883      0  C.A./SOTON 34068   10.5000          NaN        S  
884      0   SOTON/OQ 392076    7.0500          NaN        S  
885      5            382652   29.1250          NaN        Q  
886      0            211536   13.0000          NaN        S  
887      0            112053   30.0000          B42        S  
889      0            111369   30.0000         C148        C  
890      0            370376    7.7500          NaN        Q  

[714 rows x 12 columns]
row_index_83_age = titanic_survival.loc[83,"Age"]
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
print(titanic_survival.loc[83])
28.0
1
PassengerId                         84
Survived                             0
Pclass                               1
Name           Carrau, Mr. Francisco M
Sex                               male
Age                                 28
SibSp                                0
Parch                                0
Ticket                          113059
Fare                              47.1
Cabin                              NaN
Embarked                             S
Name: 83, dtype: object
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.loc[0:10])
     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
    PassengerId  Survived  Pclass                                  Name   Sex  \
0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1           852         0       3                   Svensson, Mr. Johan  male   
2           494         0       1               Artagaveytia, Mr. Ramon  male   
3            97         0       1             Goldschmidt, Mr. George B  male   
4           117         0       3                  Connors, Mr. Patrick  male   
5           673         0       2           Mitchell, Mr. Henry Michael  male   
6           746         0       1          Crosby, Capt. Edward Gifford  male   
7            34         0       2                 Wheadon, Mr. Edward H  male   
8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9           281         0       3                      Duane, Mr. Frank  male   
10          457         0       1             Millet, Mr. Francis Davis  male   

     Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0   80.0      0      0       27042  30.0000   A23        S  
1   74.0      0      0      347060   7.7750   NaN        S  
2   71.0      0      0    PC 17609  49.5042   NaN        C  
3   71.0      0      0    PC 17754  34.6542    A5        C  
4   70.5      0      0      370369   7.7500   NaN        Q  
5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
6   70.0      1      1   WE/P 5735  71.0000   B22        S  
7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
8   65.0      0      1      113509  61.9792   B30        C  
9   65.0      0      0      336439   7.7500   NaN        Q  
10  65.0      0      0       13509  26.5500   E38        S  
# This function returns the hundredth item from a series
def hundredth_row(column):
    # Extract the hundredth item
    hundreth_item = column.iloc[99]
    return hundreth_item
# Return the hundredth item from each column
hundreth_row = titanic_survival.apply(hundredth_row)
print(hundreth_row)
PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
# By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"
classes = titanic_survival.apply(which_class,axis=1)
print(classes)
0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
5       Third Class
6       First Class
7       Third Class
8       Third Class
9      Second Class
10      Third Class
11      First Class
12      Third Class
13      Third Class
14      Third Class
15     Second Class
16      Third Class
17     Second Class
18      Third Class
19      Third Class
20     Second Class
21     Second Class
22      Third Class
23      First Class
24      Third Class
25      Third Class
26      Third Class
27      First Class
28      Third Class
29      Third Class
           ...     
861    Second Class
862     First Class
863     Third Class
864    Second Class
865    Second Class
866    Second Class
867     First Class
868     Third Class
869     Third Class
870     Third Class
871     First Class
872     First Class
873     Third Class
874    Second Class
875     Third Class
876     Third Class
877     Third Class
878     Third Class
879     First Class
880    Second Class
881     Third Class
882     Third Class
883    Second Class
884     Third Class
885     Third Class
886    Second Class
887     First Class
888     Third Class
889     First Class
890     Third Class
Length: 891, dtype: object
def is_minor(row):
    if row["Age"]<18:
        return True
    else:
        return False
minors = titanic_survival.apply(is_minor,axis=1)
print(minors)

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age<18:
        return "minor"
    else:
        return "adult"
age_labels = titanic_survival.apply(generate_age_label,axis=1)
print(age_labels)
0      False
1      False
2      False
3      False
4      False
5      False
6      False
7       True
8      False
9       True
10      True
11     False
12     False
13     False
14      True
15     False
16      True
17     False
18     False
19     False
20     False
21     False
22      True
23     False
24      True
25     False
26     False
27     False
28     False
29     False
       ...  
861    False
862    False
863    False
864    False
865    False
866    False
867    False
868    False
869     True
870    False
871    False
872    False
873    False
874    False
875     True
876    False
877    False
878    False
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool
0        adult
1        adult
2        adult
3        adult
4        adult
5      unknown
6        adult
7        minor
8        adult
9        minor
10       minor
11       adult
12       adult
13       adult
14       minor
15       adult
16       minor
17     unknown
18       adult
19     unknown
20       adult
21       adult
22       minor
23       adult
24       minor
25       adult
26     unknown
27       adult
28     unknown
29     unknown
        ...   
861      adult
862      adult
863    unknown
864      adult
865      adult
866      adult
867      adult
868    unknown
869      minor
870      adult
871      adult
872      adult
873      adult
874      adult
875      minor
876      adult
877      adult
878    unknown
879      adult
880      adult
881      adult
882      adult
883      adult
884      adult
885      adult
886      adult
887      adult
888    unknown
889      adult
890      adult
Length: 891, dtype: object
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels",values="Survived")
print(age_group_survival)
            Survived
age_labels          
adult       0.381032
minor       0.539823
unknown     0.293785
posted @ 2019-07-03 08:06  Shinesu  阅读(252)  评论(0编辑  收藏  举报