import numpy as np import pandas as pd a=pd.read_csv("titanic_train.csv") a.head()
age=a["Age"] #print(age.loc[0:10]) age_is_null=pd.isnull(age)#pandas.isnull()判断是不是缺失值 #print(age_is_null) age_null_true=age[age_is_null]#将age里的缺失值都提出来 #print(age_null_true) age_null_count=len(age_null_true)#确定缺失值的长度 #print(age_null_count) good_ages=a["Age"][age_is_null==False] #去掉Age中的缺失值 #print(good_ages) mean_good_ages=sum(good_ages)/len(good_ages)#算出去掉缺失值的平均年龄 print(mean_good_ages) >>> 29.6991176471 correct_mean_age=a["Age"].mean()#.mean()求平均值(无视缺失值) print(correct_mean_age) >>> 29.69911764705882 passenger_pclasses=[1,2,3] #船舱有三种 pclass_by_fare={} #定义一个空的字典,用来存船舱的平均价格 for this_class in passenger_pclasses: x=a[a['Pclass']==this_class] #将全部一等舱的数据数据提取出来(循环,完了还有二等,三等) y=x['Fare'].mean() #求一等舱价格的平均值 pclass_by_fare[this_class]=y #将一等舱及平均票价放在空字典中 print(pclass_by_fare) >>> {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
passenger_survived=a.pivot_table(index='Pclass',values='Survived',aggfunc=np.mean)#.pivot_table(index=以什么为基中,values=以什么为判断,aggfunc=判断函数)用于统计 print(passenger_survived)#每种舱平均获救 >>> Survived Pclass 1 0.629630 2 0.472826 3 0.242363 passenger_age=a.pivot_table(index='Pclass',values='Age')#aggfunc不定义,默认求均值 print(passenger_age)#每种舱平均年龄 >>> Age Pclass 1 38.233441 2 29.877630 3 25.140620 port_stats=a.pivot_table(index='Embarked',values=['Fare','Survived'],aggfunc=np.sum) print(port_stats)#不同港口票价总和,生存人数总和 >>> Fare Survived Embarked C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217
new_a=a.dropna(axis=0,subset=["Age","Sex"])#.dropna(axis=0按列,subset=去掉"Age","Sex"缺失的样本)去掉缺失值 #print(new_a) print(len(new_a)) len(a) >>> 714 891 row_index_83_age=a.loc[83,"Age"]#读取第83个样本的"Age"的值 print(row_index_83_age) >>> 28.0
new_a2=a.sort_values("Age",ascending=False)#以"Age"为准降序,ascending=Fale升序为假,所以是降序咯 print(new_a2[0:5])
a_reindex=a.reset_index(drop=True)#更新索引值 print(a_reindex[0:5])
# This function returns the hundredth item from a series def hundredth_row(column): # Extract the hundredth item hundredth_item = column.loc[99] return hundredth_item # Return the hundredth item from each column hundredth_row = titanic_survival.apply(hundredth_row) print (hundredth_row)
def not_null_count(column): column_null = pd.isnull(column) null = column[column_null] return len(null) column_null_count = titanic_survival.apply(not_null_count) print (column_null_count)