【Python】Pandas 笔记

 

pandas-data

 

 

 

1. Pandas 数据结构

1.1 Series 一维数组

In [1]:
import numpy as np
import pandas as pd
In [7]:
'''
创建序列的三种方式
    1. 通过一维数组
    2. 通过字典
    3. 通过 DataFrame 的行或列
'''
arr = np.array([1, 2, 3, 4, 5])
print(arr)
s1 = pd.Series(arr)
print(s1)
print(type(s1))

dic = {'name': 'Bob', 'age': 32, 'gender': 'man'}
s2 = pd.Series(dic)
print(s2)
 
[1 2 3 4 5]
0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>
name      Bob
age        32
gender    man
dtype: object
 

1.2 DataFrame 二维数组

In [23]:
'''
创建 DataFrame 的三种方式
    1. 通过 二维数组 创建
    2. 通过字典创建
    3. 通过数据框
'''
arr = np.arange(20).reshape((4, 5))
df1 = pd.DataFrame(arr)
print(df1)
print()
df2 = pd.DataFrame(arr, columns=['a', 'b', 'c', 'd', 'e'])
print(df2)
 
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
In [15]:
dic = {'a':[1, 2, 3, 4], 'b':[5, 6, 7, 8], 'c':[9, 10, 11, 12]}
df3 = pd.DataFrame(dic)
print(df3)
 
   a  b   c
0  1  5   9
1  2  6  10
2  3  7  11
3  4  8  12
In [16]:
dic = {'one':{'a':1,'b':2,'c':3,'d':4},'two':{'a':5,'b':6,'c':7,'d':8},'three':{'a':9,'b':10,'c':11,'d':12}}
df4 = pd.DataFrame(dic)
print(df4)
 
   one  two  three
a    1    5      9
b    2    6     10
c    3    7     11
d    4    8     12
In [18]:
df5 = df4[['one', 'three']]
print(df5)
 
   one  three
a    1      9
b    2     10
c    3     11
d    4     12
 

2. 数据索引

2.1 通过索引获取数据

In [24]:
# Series
arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
s1 = pd.Series(arr)
print(s1[1])
print()

dic = {'name': 'Bob', 'age': 32, 'gender': 'man'}
s2 = pd.Series(dic)
print(s2['age'])
print()

print(s2[['name', 'gender']])
 
b

32

name      Bob
gender    man
dtype: object
In [22]:
# DataFrame
dic = {'one':{'a':1,'b':2,'c':3,'d':4},'two':{'a':5,'b':6,'c':7,'d':8},'three':{'a':9,'b':10,'c':11,'d':12}}
df4 = pd.DataFrame(dic)
print(df4)
print()
print(df4['one'])
print()
print(df4[['one','three']])
 
   one  two  three
a    1    5      9
b    2    6     10
c    3    7     11
d    4    8     12

a    1
b    2
c    3
d    4
Name: one, dtype: int64

   one  three
a    1      9
b    2     10
c    3     11
d    4     12
 

3.利用 Pandas 查询数据

In [26]:
# 准备数据
stu_dic = {'Age':[14,13,13,14,14,12,12,15,13,12,11,14,12,15,16,12,15,11,15],
'Height':[69,56.5,65.3,62.8,63.5,57.3,59.8,62.5,62.5,59,51.3,64.3,56.3,66.5,72,64.8,67,57.5,66.5],
'Name':['Alfred','Alice','Barbara','Carol','Henry','James','Jane','Janet','Jeffrey','John','Joyce','Judy','Louise','Marry','Philip','Robert','Ronald','Thomas','Willam'],
'Gender':['M','F','F','F','M','M','F','F','M','M','F','F','F','F','M','M','M','M','M'],
'Weight':[112.5,84,98,102.5,102.5,83,84.5,112.5,84,99.5,50.5,90,77,112,150,128,133,85,112]}

students = pd.DataFrame(stu_dic, columns=['Name', 'Age', 'Gender', 'Height', 'Weight'])
print(students)
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M    57.3    83.0
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
In [27]:
# 查看前五行
print(students.head())
print()
# 查看末五行
print(students.tail())
 
      Name  Age Gender  Height  Weight
0   Alfred   14      M    69.0   112.5
1    Alice   13      F    56.5    84.0
2  Barbara   13      F    65.3    98.0
3    Carol   14      F    62.8   102.5
4    Henry   14      M    63.5   102.5

      Name  Age Gender  Height  Weight
14  Philip   16      M    72.0   150.0
15  Robert   12      M    64.8   128.0
16  Ronald   15      M    67.0   133.0
17  Thomas   11      M    57.5    85.0
18  Willam   15      M    66.5   112.0
In [31]:
# 索引指定的行
print(students.loc[[0, 4, 5, 8]])
print()

# 索引指定的列
print(students[['Name', 'Height', 'Weight']])
 
      Name  Age Gender  Height  Weight
0   Alfred   14      M    69.0   112.5
4    Henry   14      M    63.5   102.5
5    James   12      M    57.3    83.0
8  Jeffrey   13      M    62.5    84.0

       Name  Height  Weight
0    Alfred    69.0   112.5
1     Alice    56.5    84.0
2   Barbara    65.3    98.0
3     Carol    62.8   102.5
4     Henry    63.5   102.5
5     James    57.3    83.0
6      Jane    59.8    84.5
7     Janet    62.5   112.5
8   Jeffrey    62.5    84.0
9      John    59.0    99.5
10    Joyce    51.3    50.5
11     Judy    64.3    90.0
12   Louise    56.3    77.0
13    Marry    66.5   112.0
14   Philip    72.0   150.0
15   Robert    64.8   128.0
16   Ronald    67.0   133.0
17   Thomas    57.5    85.0
18   Willam    66.5   112.0
In [34]:
# 筛选出所有十二岁以上的女生
print(students[(students['Gender'] == 'F') & (students['Age'] > 12) ])
 
       Name  Age Gender  Height  Weight
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
7     Janet   15      F    62.5   112.5
11     Judy   14      F    64.3    90.0
13    Marry   15      F    66.5   112.0
In [35]:
# 列出所有十二岁以上女生的姓名身高和体重
print(students[(students['Gender'] == 'F') & (students['Age'] > 12) ][['Name', 'Height', 'Weight']])
 
       Name  Height  Weight
1     Alice    56.5    84.0
2   Barbara    65.3    98.0
3     Carol    62.8   102.5
7     Janet    62.5   112.5
11     Judy    64.3    90.0
13    Marry    66.5   112.0
 

如果是多个条件的查询,必须在&(且)或者|(或)的两端条件用括号括起来。

 

4. 利用 Pandas 进行数据统计分析

In [36]:
# 准备数据
stu_dic = {'Age':[14,13,13,14,14,12,12,15,13,12,11,14,12,15,16,12,15,11,15],
'Height':[69,56.5,65.3,62.8,63.5,57.3,59.8,62.5,62.5,59,51.3,64.3,56.3,66.5,72,64.8,67,57.5,66.5],
'Name':['Alfred','Alice','Barbara','Carol','Henry','James','Jane','Janet','Jeffrey','John','Joyce','Judy','Louise','Marry','Philip','Robert','Ronald','Thomas','Willam'],
'Gender':['M','F','F','F','M','M','F','F','M','M','F','F','F','F','M','M','M','M','M'],
'Weight':[112.5,84,98,102.5,102.5,83,84.5,112.5,84,99.5,50.5,90,77,112,150,128,133,85,112]}

students = pd.DataFrame(stu_dic, columns=['Name', 'Age', 'Gender', 'Height', 'Weight'])
print(students)
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M    57.3    83.0
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
 

4.1 数据量、求和、均值、最值

In [42]:
print("数据量: ")
print(students.count(), '\n')

print("求和: ")    # 字符类型求和结果是拼接
print(students.sum(), '\n')

print("均值: ")
print(students[['Age', 'Height', 'Weight']].mean(), '\n')

print("最大值:")
print(students[['Age', 'Height', 'Weight']].max(), '\n')

print("最小值:")
print(students[['Age', 'Height', 'Weight']].min(), '\n')

print("最大值的索引:")
print(students[['Age', 'Height', 'Weight']].idxmax(), '\n')

print("最小值的索引:")
print(students[['Age', 'Height', 'Weight']].idxmin(), '\n')
 
数据量: 
Name      19
Age       19
Gender    19
Height    19
Weight    19
dtype: int64 

求和: 
Name      AlfredAliceBarbaraCarolHenryJamesJaneJanetJeff...
Age                                                     253
Gender                                  MFFFMMFFMMFFFFMMMMM
Height                                               1184.4
Weight                                               1900.5
dtype: object 

均值: 
Age        13.315789
Height     62.336842
Weight    100.026316
dtype: float64 

最大值:
Age        16.0
Height     72.0
Weight    150.0
dtype: float64 

最小值:
Age       11.0
Height    51.3
Weight    50.5
dtype: float64 

最大值的索引:
Age       14
Height    14
Weight    14
dtype: int64 

最小值的索引:
Age       10
Height    10
Weight    10
dtype: int64 

 

4.2 中位数、 众数、 方差、 分位数

In [43]:
print("中位数: ")
print(students[['Age', 'Height', 'Weight']].median(), '\n')

print("众数: ")
print(students[['Age', 'Height', 'Weight']].mode(), '\n')

print("方差: ")
print(students[['Age', 'Height', 'Weight']].var(), '\n')

print("10% 分位数")
print(students[['Age', 'Height', 'Weight']].quantile(0.1), '\n')
 
中位数: 
Age       13.0
Height    62.8
Weight    99.5
dtype: float64 

众数: 
    Age  Height  Weight
0  12.0    62.5    84.0
1   NaN    66.5   102.5
2   NaN     NaN   112.0
3   NaN     NaN   112.5 

方差: 
Age         2.228070
Height     26.286901
Weight    518.652047
dtype: float64 

10% 分位数
Age       11.80
Height    56.46
Weight    81.80
Name: 0.1, dtype: float64 

 

4.3 标准差、平均绝对偏差、偏度、峰度

In [44]:
print("标准差: ")
print(students[['Age', 'Height', 'Weight']].std(), '\n')

print("平均绝对偏差: ")
print(students[['Age', 'Height', 'Weight']].mad(), '\n')

print("偏度: ")
print(students[['Age', 'Height', 'Weight']].skew(), '\n')

print("峰度: ")
print(students[['Age', 'Height', 'Weight']].kurt(), '\n')
 
标准差: 
Age        1.492672
Height     5.127075
Weight    22.773933
dtype: float64 

平均绝对偏差: 
Age        1.279778
Height     4.069252
Weight    17.343490
dtype: float64 

偏度: 
Age       0.063612
Height   -0.259670
Weight    0.183351
dtype: float64 

峰度: 
Age      -1.110926
Height   -0.138969
Weight    0.683365
dtype: float64 

In [45]:
# 概览
print(students.describe())
 
             Age     Height      Weight
count  19.000000  19.000000   19.000000
mean   13.315789  62.336842  100.026316
std     1.492672   5.127075   22.773933
min    11.000000  51.300000   50.500000
25%    12.000000  58.250000   84.250000
50%    13.000000  62.800000   99.500000
75%    14.500000  65.900000  112.250000
max    16.000000  72.000000  150.000000
 

4.4 自定义数值统计 apply()

In [47]:
def state(x):
    return pd.Series([x.std(), x.mean(), x.count(), x.max(), x.min()],
                    index=['std', 'mean', 'count', 'max', 'min'])
students[['Age', 'Height', 'Weight']].apply(state)
Out[47]:
 
 AgeHeightWeight
std 1.492672 5.127075 22.773933
mean 13.315789 62.336842 100.026316
count 19.000000 19.000000 19.000000
max 16.000000 72.000000 150.000000
min 11.000000 51.300000 50.500000
 

5. 使用 Pandas 进行类 SQL 操作

 

5.1 新增

In [50]:
# 新增行或列
add = [{'Name': 'Alice', 'Age': 15, 'Gender': 'F', 'Height': 130, 'Weight': 88}, 
       {'Name': 'James', 'Age': 13, 'Gender': 'M', 'Height': 100, 'Weight': 120}]
df = pd.DataFrame(add)
print(df, '\n')


print(pd.concat([students, df]))
 
    Name  Age Gender  Height  Weight
0  Alice   15      F     130      88
1  James   13      M     100     120 

       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M    57.3    83.0
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
0     Alice   15      F   130.0    88.0
1     James   13      M   100.0   120.0
In [51]:
# 重建索引
print(pd.concat([students, df], ignore_index=True))
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M    57.3    83.0
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
19    Alice   15      F   130.0    88.0
20    James   13      M   100.0   120.0
In [53]:
print(pd.DataFrame(students, columns=['Name', 'Age', 'Gender', 'Height', 'Weight', 'score']))
 
       Name  Age Gender  Height  Weight  score
0    Alfred   14      M    69.0   112.5    NaN
1     Alice   13      F    56.5    84.0    NaN
2   Barbara   13      F    65.3    98.0    NaN
3     Carol   14      F    62.8   102.5    NaN
4     Henry   14      M    63.5   102.5    NaN
5     James   12      M    57.3    83.0    NaN
6      Jane   12      F    59.8    84.5    NaN
7     Janet   15      F    62.5   112.5    NaN
8   Jeffrey   13      M    62.5    84.0    NaN
9      John   12      M    59.0    99.5    NaN
10    Joyce   11      F    51.3    50.5    NaN
11     Judy   14      F    64.3    90.0    NaN
12   Louise   12      F    56.3    77.0    NaN
13    Marry   15      F    66.5   112.0    NaN
14   Philip   16      M    72.0   150.0    NaN
15   Robert   12      M    64.8   128.0    NaN
16   Ronald   15      M    67.0   133.0    NaN
17   Thomas   11      M    57.5    85.0    NaN
18   Willam   15      M    66.5   112.0    NaN
 

5.2 删除

In [56]:
# 删除行
print(students.drop([2, 3, 5]))
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
4     Henry   14      M    63.5   102.5
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
In [57]:
# 删除列
print(students.drop(columns=['Age']))
 
       Name Gender  Height  Weight
0    Alfred      M    69.0   112.5
1     Alice      F    56.5    84.0
2   Barbara      F    65.3    98.0
3     Carol      F    62.8   102.5
4     Henry      M    63.5   102.5
5     James      M    57.3    83.0
6      Jane      F    59.8    84.5
7     Janet      F    62.5   112.5
8   Jeffrey      M    62.5    84.0
9      John      M    59.0    99.5
10    Joyce      F    51.3    50.5
11     Judy      F    64.3    90.0
12   Louise      F    56.3    77.0
13    Marry      F    66.5   112.0
14   Philip      M    72.0   150.0
15   Robert      M    64.8   128.0
16   Ronald      M    67.0   133.0
17   Thomas      M    57.5    85.0
18   Willam      M    66.5   112.0
In [58]:
# 删除 14 岁以下的女生, 就是筛选出 14岁以上或14岁以下的男生
print(students[(students['Age'] >= 14) | ((students['Age'] < 14) & (students['Gender'] == 'M')) ])
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M    57.3    83.0
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
11     Judy   14      F    64.3    90.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
 

5.3 修改

In [65]:
# 将 James 的身高修改为 1000
students.loc[students['Name'] == 'James', 'Height'] = 1000
print(students[students['Name'] == 'James'])
 
    Name  Age Gender  Height  Weight
5  James   12      M  1000.0    83.0
 

5.4 查询, 即筛选

5.5 聚合

In [68]:
print(students.groupby('Gender').count())
 
        Name  Age  Height  Weight
Gender                           
F          9    9       9       9
M         10   10      10      10
In [69]:
print(students.drop('Age', axis=1).groupby('Gender').mean())
 
            Height      Weight
Gender                        
F        60.588889   90.111111
M       158.180000  108.950000
In [71]:
print(students.groupby(['Gender', 'Age']).mean())
 
            Height  Weight
Gender Age                
F      11    51.30   50.50
       12    58.05   80.75
       13    60.90   91.00
       14    63.55   96.25
       15    64.50  112.25
M      11    57.50   85.00
       12   374.60  103.50
       13    62.50   84.00
       14    66.25  107.50
       15    66.75  122.50
       16    72.00  150.00
 

5.6 排序 sort_index 和 sort_values

In [72]:
# 先按年龄,再按身高排序
print(students.sort_values(by=['Age', 'Height']))
 
       Name  Age Gender  Height  Weight
10    Joyce   11      F    51.3    50.5
17   Thomas   11      M    57.5    85.0
12   Louise   12      F    56.3    77.0
9      John   12      M    59.0    99.5
6      Jane   12      F    59.8    84.5
15   Robert   12      M    64.8   128.0
5     James   12      M  1000.0    83.0
1     Alice   13      F    56.5    84.0
8   Jeffrey   13      M    62.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
11     Judy   14      F    64.3    90.0
0    Alfred   14      M    69.0   112.5
7     Janet   15      F    62.5   112.5
13    Marry   15      F    66.5   112.0
18   Willam   15      M    66.5   112.0
16   Ronald   15      M    67.0   133.0
14   Philip   16      M    72.0   150.0
In [73]:
# 降序
print(students.sort_values(by=['Age', 'Height'], ascending=False))
 
       Name  Age Gender  Height  Weight
14   Philip   16      M    72.0   150.0
16   Ronald   15      M    67.0   133.0
13    Marry   15      F    66.5   112.0
18   Willam   15      M    66.5   112.0
7     Janet   15      F    62.5   112.5
0    Alfred   14      M    69.0   112.5
11     Judy   14      F    64.3    90.0
4     Henry   14      M    63.5   102.5
3     Carol   14      F    62.8   102.5
2   Barbara   13      F    65.3    98.0
8   Jeffrey   13      M    62.5    84.0
1     Alice   13      F    56.5    84.0
5     James   12      M  1000.0    83.0
15   Robert   12      M    64.8   128.0
6      Jane   12      F    59.8    84.5
9      John   12      M    59.0    99.5
12   Louise   12      F    56.3    77.0
17   Thomas   11      M    57.5    85.0
10    Joyce   11      F    51.3    50.5
 

5.7 多表连接

In [74]:
dic2 = {'Name':['Alfred','Alice','Barbara','Carol','Henry','Jeffrey','Judy','Philip','Robert','Willam'],
        'Score':[88,76,89,67,79,90,92,86,73,77]}
score = pd.DataFrame(dic2)
students2 = pd.merge(students, score, on='Name')    # 默认内联
print(students2)
 
      Name  Age Gender  Height  Weight  Score
0   Alfred   14      M    69.0   112.5     88
1    Alice   13      F    56.5    84.0     76
2  Barbara   13      F    65.3    98.0     89
3    Carol   14      F    62.8   102.5     67
4    Henry   14      M    63.5   102.5     79
5  Jeffrey   13      M    62.5    84.0     90
6     Judy   14      F    64.3    90.0     92
7   Philip   16      M    72.0   150.0     86
8   Robert   12      M    64.8   128.0     73
9   Willam   15      M    66.5   112.0     77
In [79]:
students3 = pd.merge(students, score, on='Name', how='left')    # how 指定连接方式
print(students3)
 
       Name  Age Gender  Height  Weight  Score
0    Alfred   14      M    69.0   112.5   88.0
1     Alice   13      F    56.5    84.0   76.0
2   Barbara   13      F    65.3    98.0   89.0
3     Carol   14      F    62.8   102.5   67.0
4     Henry   14      M    63.5   102.5   79.0
5     James   12      M  1000.0    83.0    NaN
6      Jane   12      F    59.8    84.5    NaN
7     Janet   15      F    62.5   112.5    NaN
8   Jeffrey   13      M    62.5    84.0   90.0
9      John   12      M    59.0    99.5    NaN
10    Joyce   11      F    51.3    50.5    NaN
11     Judy   14      F    64.3    90.0   92.0
12   Louise   12      F    56.3    77.0    NaN
13    Marry   15      F    66.5   112.0    NaN
14   Philip   16      M    72.0   150.0   86.0
15   Robert   12      M    64.8   128.0   73.0
16   Ronald   15      M    67.0   133.0    NaN
17   Thomas   11      M    57.5    85.0    NaN
18   Willam   15      M    66.5   112.0   77.0
In [78]:
students4 = pd.merge(students, score, on='Name', how='right')
print(students4)
 
      Name  Age Gender  Height  Weight  Score
0   Alfred   14      M    69.0   112.5     88
1    Alice   13      F    56.5    84.0     76
2  Barbara   13      F    65.3    98.0     89
3    Carol   14      F    62.8   102.5     67
4    Henry   14      M    63.5   102.5     79
5  Jeffrey   13      M    62.5    84.0     90
6     Judy   14      F    64.3    90.0     92
7   Philip   16      M    72.0   150.0     86
8   Robert   12      M    64.8   128.0     73
9   Willam   15      M    66.5   112.0     77
 

6. Pandas 缺失值处理

 

6.1 删除法 dropna()

In [80]:
# 删除法
stu_score = students3['Score']
print(stu_score)
 
0     88.0
1     76.0
2     89.0
3     67.0
4     79.0
5      NaN
6      NaN
7      NaN
8     90.0
9      NaN
10     NaN
11    92.0
12     NaN
13     NaN
14    86.0
15    73.0
16     NaN
17     NaN
18    77.0
Name: Score, dtype: float64
In [81]:
print('缺失值记录数:', sum(pd.isnull(stu_score)))
# 删除缺失值的行
print(stu_score.dropna())
 
缺失值记录数: 9
0     88.0
1     76.0
2     89.0
3     67.0
4     79.0
8     90.0
11    92.0
14    86.0
15    73.0
18    77.0
Name: Score, dtype: float64
In [85]:
print(students3.dropna(axis=1))    # 删除含有 NaN 值的列
 
       Name  Age Gender  Height  Weight
0    Alfred   14      M    69.0   112.5
1     Alice   13      F    56.5    84.0
2   Barbara   13      F    65.3    98.0
3     Carol   14      F    62.8   102.5
4     Henry   14      M    63.5   102.5
5     James   12      M  1000.0    83.0
6      Jane   12      F    59.8    84.5
7     Janet   15      F    62.5   112.5
8   Jeffrey   13      M    62.5    84.0
9      John   12      M    59.0    99.5
10    Joyce   11      F    51.3    50.5
11     Judy   14      F    64.3    90.0
12   Louise   12      F    56.3    77.0
13    Marry   15      F    66.5   112.0
14   Philip   16      M    72.0   150.0
15   Robert   12      M    64.8   128.0
16   Ronald   15      M    67.0   133.0
17   Thomas   11      M    57.5    85.0
18   Willam   15      M    66.5   112.0
In [86]:
print(students3.dropna(axis=0))    # 删除含有 NaN 值的行
 
       Name  Age Gender  Height  Weight  Score
0    Alfred   14      M    69.0   112.5   88.0
1     Alice   13      F    56.5    84.0   76.0
2   Barbara   13      F    65.3    98.0   89.0
3     Carol   14      F    62.8   102.5   67.0
4     Henry   14      M    63.5   102.5   79.0
8   Jeffrey   13      M    62.5    84.0   90.0
11     Judy   14      F    64.3    90.0   92.0
14   Philip   16      M    72.0   150.0   86.0
15   Robert   12      M    64.8   128.0   73.0
18   Willam   15      M    66.5   112.0   77.0
 

6.2 补值法 fillna()

In [87]:
print(students3.fillna(0))    # 使用 0 来填补值为 NaN 的数据
 
       Name  Age Gender  Height  Weight  Score
0    Alfred   14      M    69.0   112.5   88.0
1     Alice   13      F    56.5    84.0   76.0
2   Barbara   13      F    65.3    98.0   89.0
3     Carol   14      F    62.8   102.5   67.0
4     Henry   14      M    63.5   102.5   79.0
5     James   12      M  1000.0    83.0    0.0
6      Jane   12      F    59.8    84.5    0.0
7     Janet   15      F    62.5   112.5    0.0
8   Jeffrey   13      M    62.5    84.0   90.0
9      John   12      M    59.0    99.5    0.0
10    Joyce   11      F    51.3    50.5    0.0
11     Judy   14      F    64.3    90.0   92.0
12   Louise   12      F    56.3    77.0    0.0
13    Marry   15      F    66.5   112.0    0.0
14   Philip   16      M    72.0   150.0   86.0
15   Robert   12      M    64.8   128.0   73.0
16   Ronald   15      M    67.0   133.0    0.0
17   Thomas   11      M    57.5    85.0    0.0
18   Willam   15      M    66.5   112.0   77.0
In [88]:
print(students3.fillna(method='ffill'))    # 使用前值填充
 
       Name  Age Gender  Height  Weight  Score
0    Alfred   14      M    69.0   112.5   88.0
1     Alice   13      F    56.5    84.0   76.0
2   Barbara   13      F    65.3    98.0   89.0
3     Carol   14      F    62.8   102.5   67.0
4     Henry   14      M    63.5   102.5   79.0
5     James   12      M  1000.0    83.0   79.0
6      Jane   12      F    59.8    84.5   79.0
7     Janet   15      F    62.5   112.5   79.0
8   Jeffrey   13      M    62.5    84.0   90.0
9      John   12      M    59.0    99.5   90.0
10    Joyce   11      F    51.3    50.5   90.0
11     Judy   14      F    64.3    90.0   92.0
12   Louise   12      F    56.3    77.0   92.0
13    Marry   15      F    66.5   112.0   92.0
14   Philip   16      M    72.0   150.0   86.0
15   Robert   12      M    64.8   128.0   73.0
16   Ronald   15      M    67.0   133.0   73.0
17   Thomas   11      M    57.5    85.0   73.0
18   Willam   15      M    66.5   112.0   77.0
In [89]:
print(students3.fillna(method='bfill'))    # 使用后值填充
 
       Name  Age Gender  Height  Weight  Score
0    Alfred   14      M    69.0   112.5   88.0
1     Alice   13      F    56.5    84.0   76.0
2   Barbara   13      F    65.3    98.0   89.0
3     Carol   14      F    62.8   102.5   67.0
4     Henry   14      M    63.5   102.5   79.0
5     James   12      M  1000.0    83.0   90.0
6      Jane   12      F    59.8    84.5   90.0
7     Janet   15      F    62.5   112.5   90.0
8   Jeffrey   13      M    62.5    84.0   90.0
9      John   12      M    59.0    99.5   92.0
10    Joyce   11      F    51.3    50.5   92.0
11     Judy   14      F    64.3    90.0   92.0
12   Louise   12      F    56.3    77.0   86.0
13    Marry   15      F    66.5   112.0   86.0
14   Philip   16      M    72.0   150.0   86.0
15   Robert   12      M    64.8   128.0   73.0
16   Ronald   15      M    67.0   133.0   77.0
17   Thomas   11      M    57.5    85.0   77.0
18   Willam   15      M    66.5   112.0   77.0
 

7. Pandas 数据透视

image.png

posted @ 2020-04-10 16:20  早起的虫儿去吃鸟  阅读(257)  评论(0编辑  收藏  举报