"""Python notes #15 - pandas: missing data.

Builds a small random DataFrame, adds a column ``E`` that is only partly
filled, and then shows (as commented-out ``print`` calls followed by sample
output) how ``dropna``, ``fillna`` and ``pd.isna`` treat the resulting NaN.

The numbers in the sample outputs differ from section to section because the
frame is filled by ``np.random.randn`` without a fixed seed.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # not used by the statements in this note

# Three consecutive days: 2018-01-16, 2018-01-17 and 2018-01-18.
dates = pd.date_range('20180116', periods=3)

# A two-dimensional, table-like structure: 3 rows (dates) x 4 columns (A-D).
df = pd.DataFrame(np.random.randn(3, 4), index=dates, columns=list('ABCD'))

# reindex returns a new frame; the column 'E' does not exist in df, so it is
# created and filled with NaN.
df1 = df.reindex(index=dates[0:3], columns=list(df.columns) + ['E'])

# Label-based slices include both endpoints, so the first two rows of 'E'
# become 1 and the last row stays NaN.
df1.loc[dates[0]:dates[1], 'E'] = 1

# print(df1)
#                    A         B         C         D    E
# 2018-01-16 -0.183828  1.393147  1.816151  0.595298  1.0
# 2018-01-17  1.118642 -0.106566 -0.213438  1.510072  1.0
# 2018-01-18  0.705483  1.629647 -1.657045  0.428885  NaN

# pandas uses np.nan to represent missing data.

# dropna does not modify df1 itself; it returns a new DataFrame from which
# every row containing a NaN has been removed.
# print(df1.dropna(how='any'))
# print(df1)
#                    A         B         C         D    E
# 2018-01-16  0.866927  0.918359  0.908967 -0.888321  1.0
# 2018-01-17 -0.446272  0.534636 -0.160422 -0.157928  1.0
#                    A         B         C         D    E
# 2018-01-16  0.866927  0.918359  0.908967 -0.888321  1.0
# 2018-01-17 -0.446272  0.534636 -0.160422 -0.157928  1.0
# 2018-01-18  1.095823 -1.300827  0.746324 -0.277497  NaN

# Filling NaN: fillna likewise leaves the original df1 untouched and returns
# a filled copy.
# print(df1.fillna(value=5))
# print(df1)
#                    A         B         C         D    E
# 2018-01-16  0.286535 -0.847836 -0.949535 -1.889351  1.0
# 2018-01-17 -0.530458 -0.871814  1.169275  0.337444  1.0
# 2018-01-18 -0.457999 -0.325463  0.439679 -0.104462  5.0
#                    A         B         C         D    E
# 2018-01-16  0.286535 -0.847836 -0.949535 -1.889351  1.0
# 2018-01-17 -0.530458 -0.871814  1.169275  0.337444  1.0
# 2018-01-18 -0.457999 -0.325463  0.439679 -0.104462  NaN

# To get the boolean mask where values are NaN:
# print(pd.isna(df1))
#                 A      B      C      D      E
# 2018-01-16  False  False  False  False  False
# 2018-01-17  False  False  False  False  False
# 2018-01-18  False  False  False  False   True