二、pandas学习
1.food.csv
# ============================================================================================
import pandas

# Load the food nutrition table into a DataFrame.
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  # DataFrame
print(food_info.dtypes)  # the "object" dtype is how string columns are reported
# help() prints the documentation itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
help(pandas.read_csv)
# ==============
# Output:
#   NDB_No          int64
#   Shrt_Desc       object
#   Water_(g)       float64
#   Energ_Kcal      int64
#   Protein_(g)     float64
#   .........
# ============================================================================================
food_info.head(3)  # head() shows the first five rows by default; pass a count to change it
food_info.tail()  # the last few rows
print(food_info.columns)  # column names
print(food_info.shape)  # (8618, 36)
# ==============
# Output:
#   Index(['NDB_No', 'Shrt_Desc', 'Water_..........]
# ============================================================================================
# Selecting rows. .loc looks rows up by index label, and a .loc slice
# includes both of its endpoints.
print(food_info.loc[2])  # the row labelled 2 (third row under the default 0-based index)
print(food_info.loc[2:3])  # the rows labelled 2 through 3
print(food_info.loc[[2, 3, 5]])  # the rows labelled 2, 3 and 5
# ============================================================================================
# Selecting columns. To take two columns, give both column names separated
# by a comma inside a list.
# ndb_col = food_info["NDB_No"]  # take one column
# print(ndb_col)
col_name = "NDB_No"
print(food_info[col_name])  # print the "NDB_No" column
print(food_info.shape)
# Arithmetic on a column is applied to every element of it.
print(food_info[col_name] / 1000)
# Arithmetic between two columns works element by element.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_gram = food_info["Iron_(mg)"] / 1000
food_info["Iron_(temg)"] = iron_gram  # adds a column: 36 columns before, 37 after
print(food_info.shape)
max_iron_mg = food_info["Iron_(mg)"].max()  # largest value in the column
max_Water = max_iron_mg  # earlier (misleading) name, kept so existing references still resolve
print(max_iron_mg)
# ==============
# Output:
#   0    1001
#   1    1002
#   2    1003
#   3    1004
#   ....................
# =========================================================================================
# Collect every column name, then keep only the ones measured in grams.
col_names = food_info.columns.tolist()
print(col_names)
gram_columns = [name for name in col_names if name.endswith("(g)")]  # names ending in "(g)"
print("====================")
print(gram_columns)
print("====================")
gram_df = food_info[gram_columns]
print(gram_df.head(3))
# Output:
#   ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)', 'Iron_(g)', 'Iron_(temg)']
#   ====================
#   ['Water_(g)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Iron_(g)']
#   ====================
#      Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \
#   0      15.87         0.85          81.11     2.11            0.06
#   1      15.87         0.85          81.11     2.11            0.06
#   2       0.24         0.28          99.48     0.00            0.00
#
#      Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Iron_(g)
#   0           0.0           0.06      51.368       21.021        3.043   0.00002
#   1           0.0           0.06      50.489       23.426        3.012   0.00016
#   2           0.0           0.00      61.924       28.732        3.694   0.00000
# ======================================================================================
# Sorting
# Sort the table in place by water content, smallest first (ascending is the default).
food_info.sort_values("Water_(g)", inplace=True)
print(food_info["Water_(g)"])
# Sort in place again, this time largest first (descending).
food_info.sort_values("Water_(g)", ascending=False, inplace=True)
print(food_info["Water_(g)"])
2.titanic.csv
# ====================================================================================
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()  # shows 5 rows by default
# ================================================================================
age = titanic_survival["Age"]  # the Age column
print(age.loc[0:5])  # the values labelled 0 through 5
age_is_null = pd.isnull(age)  # boolean mask, True where the age is missing
print(age_is_null)
print("===============")
age_null_true = age[age_is_null]  # only the entries whose age is missing
print(age_null_true)
# ======================
# Output:
#   0    22.0
#   1    38.0
#   2    26.0
#   3    35.0
#   4    35.0
#   5     NaN
#   Name: Age, dtype: float64
#   0    False
#   1    False
#   2    False
#   3    False
#   4    False
#   5     True
#   6    False
#   ............
#   =========================
#   5     NaN
#   17    NaN
#   19    NaN
#   26    NaN
# ================================================================================
# Pitfall: a plain sum over a column that still contains missing values
# cannot be computed and comes out as nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
# Output:
#   nan
# ================================================================================
good_ages = titanic_survival["Age"][~age_is_null]  # drop the missing values
print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)  # mean computed by hand
print(correct_mean_age)
correct_mean_age = titanic_survival["Age"].mean()  # the same mean via the built-in method
print(correct_mean_age)
# ================================================================================
# Goal: compute the average fare for each passenger class.
passenger_class = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_class:
    plass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]  # rows of this class
    pclass_fares = plass_rows["Fare"]  # every Fare value in those rows
    fare_for_class = pclass_fares.mean()  # average over those fares
    fares_by_class[this_class] = fare_for_class  # store the average for this class
print(fares_by_class)
# Output:
#   {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# ================================================================================
# index: the column the statistics are grouped by; values: the column that is
# aggregated for each index value. The string "mean" selects pandas' own
# mean; recent pandas versions warn when np.mean is passed instead.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Output:
#           Survived
#   Pclass
#   1       0.629630
#   2       0.472826
#   3       0.242363
# ================================================================================
passenger_survival = titanic_survival.pivot_table(index="Pclass", values=["Fare", "Survived"], aggfunc="mean")
print(passenger_survival)
# Output:
#                Fare  Survived
#   Pclass
#   1       84.154687  0.629630
#   2       20.662183  0.472826
#   3       13.675550  0.242363
# ================================================================================
# Dropping missing values
drop_na_columns = titanic_survival.dropna(axis=1)  # drop every column that contains a missing value
print(drop_na_columns)
new_tatanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])  # drop rows missing Age or Sex
print(new_tatanic_survival)
new_tanic_survival = titanic_survival.loc[83, "Pclass"]  # look up one value: row label 83, column "Pclass"
print(new_tanic_survival)
# ================================================================================
new_tatanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_tatanic_survival[0:10])
re_tatanic_survival = new_tatanic_survival.reset_index(drop=True)  # discard the old index and renumber
print(re_tatanic_survival)
# ================================================================================


def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    With the default 0-based index this is the 100th row.
    """
    return column.loc[99]


# apply() calls the function once per column, which assembles the 100th row.
# The result gets its own name so that it does not overwrite the function.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)