二、pandas学习

1.food.csv

============================================================================================
import pandas
# Load the food nutrition table; read_csv returns a DataFrame.
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  # DataFrame
# Per-column dtypes; text columns are reported with dtype "object".
print(food_info.dtypes)
# help() prints the documentation itself and returns None, so it is not
# wrapped in print() (that would append a stray "None" line to the output).
help(pandas.read_csv)
==============
输出:
NDB_No               int64
Shrt_Desc           object
Water_(g)          float64
Energ_Kcal           int64
Protein_(g)        float64
.........
============================================================================================
# head() shows the first five rows by default; pass a number to choose how many.
food_info.head(3)
# tail() shows the last few rows.
food_info.tail()
# Column names.
print(food_info.columns)
# (rows, columns) -- (8618, 36) for this file.
print(food_info.shape)
==============
输出:
Index(['NDB_No', 'Shrt_Desc', 'Water_..........]
============================================================================================
# Select rows by index label with .loc.
# NOTE: the labels below equal 0-based row positions only because the frame
# still has the default integer index produced by read_csv.
# Row labelled 2, i.e. the third row.
print(food_info.loc[2])
# Label slices include both ends: rows labelled 2 and 3.
print(food_info.loc[2:3])
# A list of labels selects exactly those rows: 2, 3 and 5.
print(food_info.loc[[2,3,5]])
============================================================================================
# Select data by column. To take several columns, pass a list of names,
# e.g. food_info[["NDB_No", "Shrt_Desc"]].
col_name = "NDB_No"
print(food_info[col_name])  # the whole "NDB_No" column
print(food_info.shape)
# Arithmetic with a scalar is applied to every element of the column.
print(food_info[col_name] / 1000)

# Arithmetic between two columns is applied element by element.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
iron_gram = food_info["Iron_(mg)"] / 1000
# Assigning to a new label appends a column (36 columns before, 37 after).
food_info["Iron_(temg)"] = iron_gram
print(food_info.shape)
# Largest value in the "Iron_(mg)" column.
max_iron_mg = food_info["Iron_(mg)"].max()
print(max_iron_mg)
==============
输出:
0        1001
1        1002
2        1003
3        1004
....................
=========================================================================================
# All column labels as a plain Python list.
col_names = food_info.columns.tolist()
print(col_names)

# Keep only the labels that end in "(g)".
gram_columns = [name for name in col_names if name.endswith("(g)")]

print("====================")
print(gram_columns)
print("====================")

# Indexing with a list of labels yields a DataFrame of just those columns.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
输出:
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)', 'Iron_(g)', 'Iron_(temg)']
====================
['Water_(g)', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Iron_(g)']
====================
   Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \
0      15.87         0.85          81.11     2.11            0.06   
1      15.87         0.85          81.11     2.11            0.06   
2       0.24         0.28          99.48     0.00            0.00   

   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Iron_(g)  
0           0.0           0.06      51.368       21.021        3.043   0.00002  
1           0.0           0.06      50.489       23.426        3.012   0.00016  
2           0.0           0.00      61.924       28.732        3.694   0.00000
======================================================================================
# Sorting. inplace=True reorders food_info itself instead of returning a copy.
# Ascending order: smallest water content first.
food_info.sort_values(by="Water_(g)", ascending=True, inplace=True)
print(food_info["Water_(g)"])
# Descending order: largest water content first.
food_info.sort_values(by="Water_(g)", ascending=False, inplace=True)
print(food_info["Water_(g)"])

2.titanic.csv

====================================================================================
import pandas as pd
import numpy as np
# Load the Titanic training data into a DataFrame.
titanic_survival = pd.read_csv("titanic_train.csv")
# head() shows the first five rows by default.
titanic_survival.head()

================================================================================
# The "Age" column as a Series.
age = titanic_survival["Age"]
# Label slice: entries labelled 0 through 5, both ends included.
print(age.loc[0:5])
# Boolean mask that is True wherever the age is missing.
age_is_null = age.isna()
print(age_is_null)
print("===============")
# Boolean indexing keeps only the entries whose age is missing.
age_null_true = age[age_is_null]
print(age_null_true)
======================
输出:
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
Name: Age, dtype: float64
0      False
1      False
2      False
3      False
4      False
5       True
6      False
............
=========================
5     NaN
17    NaN
19    NaN
26    NaN
================================================================================
# Naive mean: the built-in sum() does not skip missing values, so the NaN
# entries in "Age" make the whole result NaN.
mean_age = sum(titanic_survival["Age"])/len(titanic_survival[["Age"]])
print(mean_age)
输出:
nan
================================================================================
# Keep only the ages that are present. `~` negates the boolean mask, which is
# the idiomatic spelling of `mask == False`.
good_ages = titanic_survival["Age"][~age_is_null]
print(good_ages)
# Mean computed by hand over the non-missing ages.
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Series.mean() skips missing values by default, so it yields the mean of the
# non-missing ages without building the mask first.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
================================================================================
# Goal: average ticket fare for each passenger class.
passenger_class = [1, 2, 3]
# For every class, select the "Fare" values of the rows whose "Pclass" matches
# and store their mean under that class number.
fares_by_class = {
    this_class: titanic_survival.loc[
        titanic_survival["Pclass"] == this_class, "Fare"
    ].mean()
    for this_class in passenger_class
}
print(fares_by_class)
输出:
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
================================================================================
# pivot_table groups the rows by `index` and aggregates `values` per group:
# here, the mean of "Survived" (the survival rate) for each "Pclass".
# The aggregation is named by the string "mean"; passing the NumPy callable
# np.mean is deprecated in recent pandas releases and emits a FutureWarning.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
输出:
        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
================================================================================
# Several value columns can be aggregated at once: mean "Fare" and mean
# "Survived" for each "Pclass".
# The aggregation is named by the string "mean"; passing the NumPy callable
# np.mean is deprecated in recent pandas releases and emits a FutureWarning.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values=["Fare", "Survived"], aggfunc="mean"
)
print(passenger_survival)

输出:
             Fare  Survived
Pclass                     
1       84.154687  0.629630
2       20.662183  0.472826
3       13.675550  0.242363
================================================================================
# Dropping missing values.
# axis="columns": discard every column that contains a missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")
print(drop_na_columns)
# axis="index" with subset: discard the rows where "Age" or "Sex" is missing.
new_tatanic_survival = titanic_survival.dropna(axis="index", subset=["Age", "Sex"])
print(new_tatanic_survival)
# .loc[row_label, column_label] looks up a single cell.
new_tanic_survival = titanic_survival.loc[83, "Pclass"]
print(new_tanic_survival)
================================================================================
# Sort by age, oldest first; the rows keep their original index labels.
new_tatanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
# First ten rows of the sorted frame, selected by position.
print(new_tatanic_survival.iloc[:10])
# drop=True discards the old index labels and renumbers the rows from 0.
re_tatanic_survival = new_tatanic_survival.reset_index(drop=True)
print(re_tatanic_survival)
================================================================================
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    With a default 0-based integer index that is the 100th row. The lookup is
    label-based (``.loc``), not positional.
    """
    return column.loc[99]
# DataFrame.apply calls the function once per column, so the result is a
# Series holding every column's value at label 99. It is stored under its own
# name: rebinding `hundredth_row` would replace the function with the result
# and make a second run of this line fail.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)

 

posted @ 2018-10-16 23:11  camelundergo  阅读(128)  评论(0)  编辑  收藏  举报