pandas入门:处理缺失数据
# pandas使用浮点值NaN(Not a Number)表示浮点和非浮点数组中的缺失数据。
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
# Build a Series of strings in which one entry is missing (NaN).
string_data = Series(["aardvark", "artichoke", np.nan, "avocado"])
print(string_data)
# Output:
# 0     aardvark
# 1    artichoke
# 2          NaN
# 3      avocado
# dtype: object

# isnull() yields a boolean mask that is True wherever a value is missing.
print(string_data.isnull())
# Output:
# 0    False
# 1    False
# 2     True
# 3    False
# dtype: bool

# Python's built-in None is treated as NA as well.
string_data[0] = None
print(string_data.isnull())
# Output:
# 0     True
# 1    False
# 2     True
# 3    False
# dtype: bool
- dropna: 根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度
- fillna: 用指定值或插值方法(如ffill或bfill)填充缺失数据
- isnull: 返回一个含有布尔值的对象,这些布尔值表示哪些值是缺失值NA
- notnull: isnull的否定式
滤除缺失数据
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
# Filtering missing data out of a Series.
data = Series([1, NA, 3.5, NA, 7])

# dropna() returns only the non-missing entries, keeping their original labels.
print(data.dropna())
# Output:
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64

# notna() is the boolean mask of entries that are present.
print(data.notna())
# Output:
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# dtype: bool

# Selecting with that mask gives the same result as dropna().
print(data.loc[data.notna()])
# Output:
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64
# On a DataFrame, dropna() by default discards every row that contains
# at least one missing value.
data = DataFrame(
    [
        [1, 6.5, 3],
        [1, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3],
    ]
)
print(data)
# Output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

print(data.dropna())
# Output:
#      0    1    2
# 0  1.0  6.5  3.0

# how="all" discards only the rows in which every value is NA.
print(data.dropna(how="all"))
# Output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0

# To drop columns in the same way, pass axis=1.
# First add a column (label 4) that is entirely NA.
data[4] = NA
print(data)
# Output:
#      0    1    2   4
# 0  1.0  6.5  3.0 NaN
# 1  1.0  NaN  NaN NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  6.5  3.0 NaN

# Every column holds at least one NA, so no column survives.
print(data.dropna(axis=1))
# Output:
# Empty DataFrame
# Columns: []
# Index: [0, 1, 2, 3]

# Only the all-NA column (label 4) is removed.
print(data.dropna(axis=1, how="all"))
# Output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0
# Build a 7x3 frame of random values, then blank out the first five rows
# of column 1 and the first three rows of column 2.
df = DataFrame(np.random.randn(7, 3))
df.iloc[:5, 1] = NA
df.iloc[:3, 2] = NA
print(df)
# Sample output (the values are random and differ on every run):
#           0         1         2
# 0  1.034046       NaN       NaN
# 1  0.205577       NaN       NaN
# 2  0.669042       NaN       NaN
# 3 -1.081377       NaN -0.850690
# 4 -0.129405       NaN  2.280089
# 5 -0.720506  0.719188 -0.698185
# 6  1.482302  1.589606  1.712550

# thresh=3 keeps only the rows that have at least three non-NA values.
print(df.dropna(thresh=3))
# Sample output:
#           0         1         2
# 5 -0.720506  0.719188 -0.698185
# 6  1.482302  1.589606  1.712550
填充缺失数据
# NOTE: the sample outputs in this section contain random values and were
# captured from separate runs, so the numbers do not match between samples.

# A scalar argument replaces every missing value with that constant.
print(df.fillna(0))
# Sample output:
#           0         1         2
# 0 -0.044841  0.000000  0.000000
# 1 -0.432459  0.000000  0.000000
# 2  0.036653  0.000000  0.000000
# 3  1.647238  0.000000  0.623209
# 4  0.395201  0.000000  0.216717
# 5 -1.792629  1.167120  1.424606
# 6  1.986463  0.691374  0.361006

# A dict keyed by column label fills each column with its own value.
print(df.fillna({1: 0.5, 2: -1}))
# Sample output:
#           0         1         2
# 0  0.704205  0.500000 -1.000000
# 1 -0.002524  0.500000 -1.000000
# 2  1.241561  0.500000 -1.000000
# 3 -0.340080  0.500000  0.038028
# 4 -0.616660  0.500000 -0.104324
# 5 -0.254113  1.020461  0.596161
# 6 -0.026914 -0.359409 -0.876534

# fillna() returns a new object by default; inplace=True modifies the
# existing object instead.
df.fillna(0, inplace=True)
print(df)
# Sample output:
#           0         1         2
# 0 -0.187450  0.000000  0.000000
# 1  0.205142  0.000000  0.000000
# 2 -0.032737  0.000000  0.000000
# 3 -1.207977  0.000000 -0.079890
# 4  2.244593  0.000000  0.753733
# 5 -0.775953  0.553931 -0.137147
# 6  0.087671  0.426827  0.272821
# NOTE: the sample outputs in this section contain random values and were
# captured from separate runs, so the numbers do not match between samples.

# Build a 6x3 frame of random values with trailing gaps: column 1 is
# missing from row 2 onward and column 2 from row 4 onward.
df = DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)
# Sample output:
#           0         1         2
# 0 -0.040906  0.507198 -0.466641
# 1 -0.231033 -0.741952 -0.443290
# 2  2.194688       NaN  0.672457
# 3  1.002863       NaN -0.338136
# 4 -0.429903       NaN       NaN
# 5 -0.371691       NaN       NaN

# Forward fill: propagate the last valid value down each column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
forward_filled = df.ffill()
print(forward_filled)
# Sample output:
#           0         1         2
# 0 -0.117091  0.793242 -1.603526
# 1  0.911199 -0.062944  0.861507
# 2  1.529839 -0.062944 -0.206347
# 3 -0.180341 -0.062944 -0.121404
# 4  0.568776 -0.062944 -0.121404
# 5  1.673478 -0.062944 -0.121404

# limit=2 fills at most two consecutive missing values per gap, so the
# last two rows of column 1 stay NaN.
forward_filled_limited = df.ffill(limit=2)
print(forward_filled_limited)
# Sample output:
#           0         1         2
# 0  0.150973 -0.613426 -1.263605
# 1 -1.282189  0.420040  0.092557
# 2  0.919253  0.420040  0.754515
# 3 -1.570130  0.420040 -0.692602
# 4 -1.812111       NaN -0.692602
# 5 -0.568409       NaN -0.692602
# Fill the gaps in a Series with the mean of its non-missing values.
data = Series([1, NA, 3.5, NA, 7])
print(data.fillna(value=data.mean()))
# Output:
# 0    1.000000
# 1    3.833333
# 2    3.500000
# 3    3.833333
# 4    7.000000
# dtype: float64
本文来自博客园,作者:OTAKU_nicole,转载请注明原文链接:https://www.cnblogs.com/nicole-zhang/p/12955102.html