Pandas基础

目录

  读取数据

  索引与计算

  数据处理——Kaggle泰坦尼克号


 

读取数据

a.csv

name,age,height
Tom0,15,156.3
Tom1,17,162.6
Tom2,12,165.7
Tom3,15,134.3
Tom4,27,174.5
Tom5,56,176.6
Tom6,21,172.5
Tom7,53,172.1

 

# -*- coding: utf-8 -*-
import pandas as pd

# Load the sample table from a.csv into a DataFrame.
# The commented lines after each print are the output recorded for that call.
df = pd.read_csv("a.csv")

# What kind of object did read_csv return?
print(type(df))
# <class 'pandas.core.frame.DataFrame'>

# One dtype per column.
print(df.dtypes)
# name       object
# age         int64
# height    float64
# dtype: object

# The whole table.
print(df)
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5
# 5  Tom5   56   176.6
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1

# First two rows.
print(df.head(n=2))
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6

# Last two rows.
print(df.tail(n=2))
#    name  age  height
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1

# Column labels.
print(df.columns)
# Index(['name', 'age', 'height'], dtype='object')

# (number of rows, number of columns)
print(df.shape)
# (8, 3)

  

 返回目录

 

索引与计算 

 

# -*- coding: utf-8 -*-
import pandas as pd

# Load the sample table; the commented lines after each print are the
# output recorded for that call.
df = pd.read_csv("a.csv")

# Fetch a single row by its index label.
first_row = df.loc[0]
print(first_row)
# name       Tom0
# age          15
# height    156.3
# Name: 0, dtype: object

# Row slicing with .loc: both end labels (2 and 4) are included.
print(df.loc[2:4])
#    name  age  height
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5

# Print a single column.
name_column = df["name"]
print(name_column)
# 0    Tom0
# 1    Tom1
# 2    Tom2
# 3    Tom3
# 4    Tom4
# 5    Tom5
# 6    Tom6
# 7    Tom7
# Name: name, dtype: object

# Print several columns at once by passing a list of labels.
print(df[["name", "age"]])
#    name  age
# 0  Tom0   15
# 1  Tom1   17
# 2  Tom2   12
# 3  Tom3   15
# 4  Tom4   27
# 5  Tom5   56
# 6  Tom6   21
# 7  Tom7   53

# Get the column names as a plain Python list.
column_names = list(df.columns)
print(column_names)
# ['name', 'age', 'height']

# Arithmetic on a column is applied element by element.
print(df["height"] / 100)
# 0    1.563
# 1    1.626
# 2    1.657
# 3    1.343
# 4    1.745
# 5    1.766
# 6    1.725
# 7    1.721
# Name: height, dtype: float64

# Extend the DataFrame with a new column: state before ...
print(df)
#    name  age  height
# 0  Tom0   15   156.3
# 1  Tom1   17   162.6
# 2  Tom2   12   165.7
# 3  Tom3   15   134.3
# 4  Tom4   27   174.5
# 5  Tom5   56   176.6
# 6  Tom6   21   172.5
# 7  Tom7   53   172.1
print(df.shape)
# (8, 3)

# ... assigning to a label that does not exist yet appends the column ...
df["height(m)"] = df["height"] / 100

# ... and state after: one more column, same number of rows.
print(df)
#    name  age  height  height(m)
# 0  Tom0   15   156.3      1.563
# 1  Tom1   17   162.6      1.626
# 2  Tom2   12   165.7      1.657
# 3  Tom3   15   134.3      1.343
# 4  Tom4   27   174.5      1.745
# 5  Tom5   56   176.6      1.766
# 6  Tom6   21   172.5      1.725
# 7  Tom7   53   172.1      1.721
print(df.shape)
# (8, 4)

# Find the maximum value of a column.
tallest = df["height"].max()
print(tallest)
# 176.6

 

 返回目录

 

数据处理——Kaggle泰坦尼克号

 

 

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np


# Kaggle Titanic walkthrough: sorting, missing values, group means,
# dropping incomplete rows/columns, and single-cell lookup.
# The commented lines after each print are the output recorded for that call.
titanic = pd.read_csv("titanic_train.csv")

# --- Sorting -------------------------------------------------------------
# Sort passengers by age, youngest first.
by_age = titanic.sort_values("Age", ascending=True)
print(by_age.head())
#      PassengerId  Survived  Pclass                             Name     Sex  \
# 803          804         1       3  Thomas, Master. Assad Alexander    male   
# 755          756         1       2        Hamalainen, Master. Viljo    male   
# 644          645         1       3           Baclini, Miss. Eugenie  female   
# 469          470         1       3    Baclini, Miss. Helene Barbara  female   
# 78            79         1       2    Caldwell, Master. Alden Gates    male   
# 
#       Age  SibSp  Parch  Ticket     Fare Cabin Embarked  
# 803  0.42      0      1    2625   8.5167   NaN        C  
# 755  0.67      1      1  250649  14.5000   NaN        S  
# 644  0.75      2      1    2666  19.2583   NaN        C  
# 469  0.75      2      1    2666  19.2583   NaN        C  
# 78   0.83      0      2  248738  29.0000   NaN        S  

# --- Missing values --------------------------------------------------------
age = titanic["Age"]
print(age.head(10))
# 0     22.0
# 1     38.0
# 2     26.0
# 3     35.0
# 4     35.0
# 5      NaN
# 6     54.0
# 7      2.0
# 8     27.0
# 9     14.0
# Name: Age, dtype: float64

# Boolean mask: True where the age is missing.
age_is_null = pd.isnull(age)
print(age_is_null.head(10))
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# 5     True
# 6    False
# 7    False
# 8    False
# 9    False
# Name: Age, dtype: bool

# Count the missing values: index with the mask to keep only the rows
# whose age is missing, then take the length.
missing_ages = age[age_is_null]
print(missing_ages.head())
# 5    NaN
# 17   NaN
# 19   NaN
# 26   NaN
# 28   NaN
# Name: Age, dtype: float64
print(len(missing_ages))
# 177

# Mean age.
# Method 1: drop the missing entries by hand (~ inverts the mask).
known_ages = age[~age_is_null]
print(known_ages.sum() / len(known_ages))
# 29.69911764705882
# Method 2: Series.mean() gives the same figure, i.e. it leaves NaN out.
print(age.mean())
# 29.69911764705882


# --- Mean fare per passenger class ---------------------------------------
# Method 1: filter by class and average manually.
levels = [1, 2, 3]
pclass = titanic["Pclass"]  # passenger class
fare = titanic["Fare"]  # ticket price
print(type(fare))
# <class 'pandas.core.series.Series'>
print(pclass.head())
# 0    3
# 1    1
# 2    3
# 3    1
# 4    3
# Name: Pclass, dtype: int64
print(fare.head())
# 0     7.2500
# 1    71.2833
# 2     7.9250
# 3    53.1000
# 4     8.0500
# Name: Fare, dtype: float64
# float() unwraps the NumPy scalar so the printed dict shows plain numbers
# (NumPy >= 2 would otherwise render each value as np.float64(...)).
fare_dic = {level: float(fare[pclass == level].mean()) for level in levels}
print(fare_dic)
# {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

# Method 2: let pivot_table do the grouping. The aggregation is named by the
# string "mean"; passing np.mean raises a FutureWarning on pandas >= 2.1.
fare_by_class = titanic.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(fare_by_class)
#              Fare
# Pclass           
# 1       84.154687
# 2       20.662183
# 3       13.675550


# --- Survival rate per passenger class -----------------------------------
# Survived is 0/1, so its mean is the fraction of passengers who survived.
survival_by_class = titanic.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(survival_by_class)
#         Survived
# Pclass          
# 1       0.629630
# 2       0.472826
# 3       0.242363

# --- Mean age and survival rate per passenger class ----------------------
# Several value columns can be aggregated in one call.
age_survival_by_class = titanic.pivot_table(
    index="Pclass", values=["Survived", "Age"], aggfunc="mean"
)
print(age_survival_by_class)
#               Age  Survived
# Pclass                     
# 1       38.233441  0.629630
# 2       29.877630  0.472826
# 3       25.140620  0.242363

# --- Dropping missing data -----------------------------------------------
print(titanic.shape)
# (891, 12)
# axis=0: drop every row that has at least one missing value.
complete_rows = titanic.dropna(axis=0)
print(complete_rows.shape)
# (183, 12)
# axis=1: drop every column that has at least one missing value.
complete_columns = titanic.dropna(axis=1)
print(complete_columns.shape)
# (891, 9)
# Drop only the rows where "Age" or "Embarked" is missing.
with_age_and_port = titanic.dropna(axis=0, subset=["Age", "Embarked"])
print(with_age_and_port.shape)
# (712, 12)

# --- Locating a single value ---------------------------------------------
# Show the full record of the first passenger ...
print(titanic.head(1))
#    PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  \
# 0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1   
# 
#    Parch     Ticket  Fare Cabin Embarked  
# 0      0  A/5 21171  7.25   NaN        S  
# ... then read just that passenger's age with .loc[row label, column label].
print(titanic.loc[0, "Age"])
# 22.0

 

 返回目录

 

posted @ 2017-12-30 17:15  黎明程序员  阅读(257)  评论(0)  编辑  收藏  举报