# pandas

import pandas as pd
import numpy as np

# Load the Titanic training set; every later step reads from titanic_survival.
titanic_survival = pd.read_csv("titanic_train.csv")

# Pull out the "Age" column and show the rows labelled 0 through 10.
# NOTE: .loc slices by label and includes both endpoints, so with the default
# integer index this prints 11 values, not 10.
age = titanic_survival["Age"]
print(age.loc[0:10])

# Boolean mask: True where the age is missing, False where it is present.
# The mask can be used directly as an index into the column.
age_is_null = age.isnull()
print(age_is_null)

# Indexing with the mask keeps only the True positions, i.e. the missing ages.
age_null_true = age[age_is_null]
print(age_null_true)

# Number of passengers with no recorded age.
age_null_count = len(age_null_true)
print(age_null_count)

# Naive mean over the raw column. The built-in sum() propagates NaN, so when
# any age is missing this prints nan.
all_ages = titanic_survival["Age"]
mean_age = sum(all_ages) / len(all_ages)
print(mean_age)

# Keep only the rows whose age is present (the inverse of the missing-value
# mask) and average those instead.
good_ages = all_ages[~age_is_null]
mean_age = sum(good_ages) / len(good_ages)
print(mean_age)

# Shortcut: Series.mean() skips missing values by default.
correct_mean = titanic_survival["Age"].mean()
print(correct_mean)

 

# Average fare per passenger class, computed with an explicit loop.
# The result maps each class number to the mean of its "Fare" values.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Select only the rows that belong to the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after all classes have been processed.
print(fares_by_class)

# Survival rate per passenger class via pivot_table. The aggregation is given
# by name ("mean") rather than as np.mean: recent pandas versions warn when a
# NumPy callable is passed, and the string selects the same aggregation.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# Average age per passenger class. When aggfunc is omitted, pivot_table
# computes the mean.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

# Per embarkation port: total fare collected and number of survivors
# (the sum of the "Survived" column).
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)

# axis="columns" (same as axis=1) targets columns: any column that contains
# at least one missing value is removed from the result.
drop_na_columns = titanic_survival.dropna(axis="columns")
print(drop_na_columns)

# axis="index" (same as axis=0) targets rows: a row is removed when either
# of the columns listed in subset ("Age", "Sex") is missing in that row.
new_titanic_survival = titanic_survival.dropna(
    axis="index", subset=["Age", "Sex"]
)
print(new_titanic_survival)

# .loc[row_label, column_label] addresses one exact cell of the table.
row_index_83_age = titanic_survival.loc[83, "Age"]
row_index_766_pclass = titanic_survival.loc[766, "Pclass"]

# Show both looked-up values, one per line.
for picked_value in (row_index_83_age, row_index_766_pclass):
    print(picked_value)

# Order the passengers by age, oldest first, and show the first ten rows.
# The rows keep their original index labels after sorting.
new2_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new2_titanic_survival.head(10))

# Renumber the sorted rows from 0 upward, discarding the old index labels.
titanic_reindexed = new2_titanic_survival.reset_index(drop=True)
print("---------------")
# Label-based slice on the fresh index: labels 0 through 10 inclusive.
print(titanic_reindexed.loc[:10])

# Posted 2018-01-01 22:18 by 仔仔爱python | views (236) | comments (0) | edit | bookmark | report