pandas数据清洗的一些操作
import pandas as pd

# Uncomment to make pandas print every column / row and wider cells:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth',100)

# NOTE(review): the cleaned frame is written back to the same path it was
# read from, so the raw file is overwritten — confirm this is intended.
SOURCE_PATH = "z:/clear1.csv"
TARGET_PATH = "z:/clear1.csv"

# Rows issued before this calendar year are discarded.
MIN_YEAR = 2014

# Columns removed from the cleaned output.
DROP_COLUMNS = ("total_bal_ex_mort", "tot_coll_amt", "sub_grade")


def missing_ratio(df: pd.DataFrame) -> pd.Series:
    """Return the fraction of missing values per column.

    Columns without any missing value are omitted; the remaining ones are
    sorted in ascending order of their missing fraction.
    """
    ratio = df.isna().mean()
    return ratio[ratio != 0].sort_values()


def clean_loans(
    df: pd.DataFrame,
    min_year: int = MIN_YEAR,
    drop_columns: tuple = DROP_COLUMNS,
) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Drop rows whose ``issue_d`` cell contains the literal text
         ``issue_d`` (e.g. header lines repeated inside the data).
      2. Parse ``issue_d`` as a date and store its year in a ``year`` column.
      3. Keep only rows with ``year >= min_year``; rows whose date is
         missing have no year and are dropped as well.
      4. Remove the columns listed in *drop_columns*.

    Raises:
        KeyError: if ``issue_d`` or any of *drop_columns* is absent.
    """
    # The header-row filter must run BEFORE date parsing: the literal string
    # "issue_d" is not a date and would make pd.to_datetime raise.
    # na=False treats missing cells as "not a header" so the mask stays
    # boolean; regex=False because a plain substring match is intended.
    is_header = df["issue_d"].str.contains("issue_d", na=False, regex=False)
    df = df[~is_header]

    # assign() returns a new frame, so neither the caller's data nor a
    # filtered view is written to in place.
    df = df.assign(year=pd.to_datetime(df["issue_d"]).dt.year)
    df = df.loc[df["year"] >= min_year]

    return df.drop(columns=list(drop_columns))


def main() -> None:
    """Read the CSV, print diagnostics, clean it and save the result."""
    # Read the file.
    df = pd.read_csv(SOURCE_PATH, encoding="utf-8", low_memory=True)
    print(df.head())

    # Missing-value statistics (share of NaN per column).
    print(missing_ratio(df))

    df = clean_loans(df)

    # Row count per issue year, then per issue date.
    print(df["year"].value_counts())
    print(df["issue_d"].value_counts(sort=True))
    print(df)

    # Save the data.
    df.to_csv(TARGET_PATH, index=False, encoding="utf-8")


if __name__ == "__main__":
    main()