Python-Dataframe数据清洗之0值、缺失值、重复数据(以多列去重)、不符合累计递增趋势的异常数据(跳大值和跳小值,兼噪声值)清洗
起步者的苦苦挣扎......
方法一(单个ID去清洗):这个代码和上面的差不多,只是它进行的是单个递增趋势逐个进行清洗,,总的来说对于常见的异常情况有不错效果
缺点:效率比较低,半自动化,需要清洗多个ID的异常数据时,手动重复的动作比较多
import cx_Oracle import pandas as pd import numpy as np import time from sqlalchemy import create_engine from sqlalchemy.dialects.oracle import \ BFILE, BLOB, CHAR, CLOB, DATE, \ DOUBLE_PRECISION, FLOAT, INTERVAL, LONG, NCLOB, \ NUMBER, NVARCHAR, NVARCHAR2, RAW, TIMESTAMP, VARCHAR, \ VARCHAR2 import matplotlib.pyplot as plt import os os.environ['NLS_LANG']='SIMPLIFIED CHINESE_CHINA.UTF8' def connectToOracle(): conn=create_engine('oracle+cx_oracle://用户名:密码@IP:端口/实例名') #conn连接器 sql="select MONITOR_ID,COLLECT_DATE,COLLECT_TIME,VALUE_DATA,UPLOAD,LESSEE_ID " \ "from 表名 " \ "where MONITOR_ID='锦绣家园\LJLL1' and to_char(COLLECT_DATE,'yyyy-mm')='2020-04'" \ "order by COLLECT_TIME" try: data=pd.read_sql(sql,conn) print("连接成功") except: print("connecterror") conn.dispose() #关闭连接器 MDH_Data_Cleansing(data) def MDH_Data_Cleansing(data): print(data.isnull()) data.info() #查看数据基本信息 value_data = np.array(data['value_data']) print(data) # print(value_data) # 对data数组中不符合单调递增趋势的异常数据进行清洗 n = len(value_data) print(n) for i in range(1, n - 2): if (value_data1[i] < value_data1[i - 1] and value_data1[i - 4] != None and value_data1[i - 5] != None): m = min((value_data1[i - 1] - value_data1[i - 2]), (value_data1[i - 2] - value_data1[i - 3]), \ (value_data1[i - 3] - value_data1[i - 4]), (value_data1[i - 4] - value_data1[i - 5])) value_data1[i] = value_data1[i - 1] + m if (value_data1[i] > value_data1[i - 1] and value_data1[i] > value_data1[i + 1] and value_data1[i] > value_data1[i + 2] \ and value_data1[i - 4] != None and value_data1[i - 5] != None): m = min((value_data1[i - 1] - value_data1[i - 2]), (value_data1[i - 2] - value_data1[i - 3]), \ (value_data1[i - 3] - value_data1[i - 4]), (value_data1[i - 4] - value_data1[i - 5])) value_data1[i] = value_data1[i - 1] + m i += 1 print(1) data.drop('value_data', axis=1, inplace=True) # 删除列value_data data['value_data']=value_data #清洗后列值替换 print(data) data.dropna(subset=['value_data'], axis=0, inplace=True) # 删除列VALUE_DATA存在缺失值的所在行 #箱线图分析法检测噪声值 print(data['value_data'].describe(percentiles=[.25, .75], include=['object', 'float64'])) # describe distance_data=data['value_data'].quantile(0.75)-data['value_data'].quantile(0.25) #四分位距,即箱 top_data=data['value_data'].quantile(0.75)+1.5*distance_data #箱线的上限 bottom_data=data['value_data'].quantile(0.25)-1.5*distance_data #箱线的下限 count_data=(data['value_data']>top_data) | (data['value_data']<bottom_data) #噪声值 index_toarray = np.array(data[count_data == True].index) # 取出异常值索引 print("正常值 vs 噪声值个数:\n",count_data.value_counts(),"\n噪声值的行索引:",index_toarray) #打印噪声值数 #噪声值处理 data.loc[index_toarray,'value_data']=data['value_data'].median().round(3) #中位数替换 print(data) ''' data.drop_duplicates(subset=['new_column'],keep='first',inplace=True) #根据新列去进行去重 index_toarray = np.array(data_demo[count_data == False].index) # 取出异常值 索引 print("正常值(True) vs 噪声值个数(False):", count_data.value_counts(), "噪声值的行索引:", index_toarray) # 打印噪声值数和索引 data.dropna(subset=['value_data'], axis=0, inplace=True) # 删除列Dataframe数据类型中value_data存在缺失值的所在行,以保证数据的可靠性 ''' data.drop_duplicates(subset=['monitor_id','collect_date','value_data'],keep='first',inplace=True) #根据多列进行去重 MDH_Dataframe_toOracle(data) def mapping_data_types(data): #实现Dataframe字段的类型转换(必转,否则就是给自己挖坑,不要问我是怎么知道的) dtypedict = {} for i, j in zip(data.columns, data.dtypes): if "object" in str(j): dtypedict.update({i: VARCHAR(256)}) if "int" in str(j): dtypedict.update({i: NUMBER(12,2)}) if "date" in str(j): dtypedict.update({i: DATE(19)}) return dtypedict def MDH_Dataframe_toOracle(data): #将Dataframe数据写入ORACLE数据库 from sqlalchemy import types, create_engine conn=create_engine('oracle+cx_oracle://用户名:密码@IP:端口/实例名',encoding='utf-8',echo=True) #连接器 from sqlalchemy.dialects.oracle import \ BFILE, BLOB, CHAR, CLOB, DATE, \ DOUBLE_PRECISION, FLOAT, INTERVAL, LONG, NCLOB, \ NUMBER, NVARCHAR, NVARCHAR2, RAW, TIMESTAMP, VARCHAR, \ VARCHAR2 #print(conn) dtypedict = mapping_data_types(data) #映射数据类型 tableName='monitor_data_his_cleanaftertb' data.to_sql(tableName,con=conn,if_exists='append',dtype=dtypedict,chunksize=None,index=False) conn.dispose() if __name__ == '__main__': pd.set_option('display.max_columns', None) # 控制台完整显示列 pd.set_option('display.max_rows', 1000) # 行数 pd.set_option('display.width',500) # 列数 pd.set_option('max.colwidth',100) #列宽 connectToOracle() MDH_Dataframe_toOracle(data)
方法二(多个ID去清洗):该算法存在缺陷,仅适用于常见的数据异常情况,对于多个连续并且任意的异常数据或噪声值无法处理。利用了普通循环判断+分箱法进行数据检测和平滑,数据量越大,预期的清洗效果越好。。。
import cx_Oracle import pandas as pd import numpy as np from sqlalchemy import create_engine from sqlalchemy.dialects.oracle import \ BFILE, BLOB, CHAR, CLOB, DATE, \ DOUBLE_PRECISION, FLOAT, INTERVAL, LONG, NCLOB, \ NUMBER, NVARCHAR, NVARCHAR2, RAW, TIMESTAMP, VARCHAR, \ VARCHAR2 import matplotlib.pyplot as plt import time import os os.environ['NLS_LANG']='SIMPLIFIED CHINESE_CHINA.UTF8' #连接数据库、获取数据 def connectToOracle(): conn=create_engine('oracle+cx_oracle://用户名:密码@IP:端口/实例名') #conn连接器 # #测试sql # sql="select MONITOR_ID,COLLECT_DATE,COLLECT_TIME,VALUE_DATA,UPLOAD,LESSEE_ID,rk " \ # "from (" \ # "select a.*,row_number() over(partition by MONITOR_ID order by COLLECT_TIME) rk " \ # "from MONITOR_DATA_HIS_clearn_test a " \ # ")" \ # "where rk>=0" sql="select " \ "datata.MONITOR_ID," \ "datata.COLLECT_DATE," \ "datata.COLLECT_TIME," \ "datata.VALUE_DATA," \ "datata.UPLOAD," \ "datata.LESSEE_ID," \ "datata.rk" \ " from (" \ "select MONITOR_ID, " \ "to_date(substr(to_char(COLLECT_DATE,'yyyy-mm-dd hh:mi:ss'),0,10),'yyyy-mm-dd') COLLECT_DATE," \ "COLLECT_DATE COLLECT_TIME," \ "round(VALUE_DATA,2) VALUE_DATA," \ "UPLOAD," \ "LESSEE_ID," \ "row_number() over(partition by MONITOR_ID order by COLLECT_DATE) rk " \ "from 表名" \ "where MONITOR_ID in(" \ "select MONITOR_ID from (" \ "select b.MONITOR_ID MONITOR_ID " \ "from (" \ "select distinct MONITOR_ID from monitor_data_relation t1 join monitor_item t2 " \ "on t1.item_id=t2.item_id " \ "where item_name!='负累计流量' and item_name!='反向累计流量' and item_name like '%累计流量%'" \ ") a " \ "left join" \ "(SELECT distinct MONITOR_ID FROM 表名) b " \ "on a.MONITOR_ID=b.MONITOR_ID" \ ") where MONITOR_ID is not null " \ ") " \ ") datata " \ "where rk>=1 and VALUE_DATA!=0" data=pd.read_sql(sql,conn) print("连接成功") conn.dispose() #关闭连接器 MDH_Data_Cleansing(data) #数据清洗 def MDH_Data_Cleansing(data): print("待清洗数据:\n", data) #print(data.isnull()) rk=np.array(data['rk']) #声明所有递增趋势序号的数组,rk数据模板:[1,2,3,1,2,1,2,3,4···] #print(rk) m=len(rk) #获取每个递增趋势数据的头节点的行索引,存进节点数组arraySwitch arraySwitch = [] for j in range(1,m-1): if(rk[j] == 1): arraySwitch.append(j-1) arraySwitch.append(j) j += 1 arraySwitch.insert(0,0) arraySwitch.append(m-1) #追加最后一个递增趋势数据的尾节点的行索引(即所有递增趋势的数据的总数-1) num=len(arraySwitch) print("节点数组个数num=",num) print("节点数组arraySwitch=",arraySwitch) # 对data数组中不符合单调递增趋势的异常数据进行清洗 dataResult = pd.DataFrame(columns=['value_data']) # 用于存储最终清洗和降噪完成,合并后的Dframe数据 s = 0 while s < num - 1: value_data1 = np.array(data.loc[arraySwitch[s]:arraySwitch[s + 1], 'value_data']) # 循环逐一取出递增趋势数据 # value_data2=value_data1[0:-1] #删除每一个取出的递增趋势数据的脏数据(每个递增趋势数据的最后一个值,但最后一个递增趋势数据是没有脏数据的) # print(value_data1) n = len(value_data1) # 获取单个递增趋势数据的个数 # 清洗 for i in range(1, n - 2): if (value_data1[i] < value_data1[i - 1] and value_data1[i - 4] != None and value_data1[i - 5] != None): m = min((value_data1[i - 1] - value_data1[i - 2]),(value_data1[i - 2] - value_data1[i - 3]), \ (value_data1[i - 3] - value_data1[i - 4]),(value_data1[i - 4] - value_data1[i - 5])) value_data1[i] = value_data1[i - 1] + m if (value_data1[i] > value_data1[i - 1] and value_data1[i] > value_data1[i + 1] and value_data1[i] > value_data1[i + 2] \ and value_data1[i - 4] != None and value_data1[i - 5] != None): m = min((value_data1[i - 1] - value_data1[i - 2]), (value_data1[i - 2] - value_data1[i - 3]), \ (value_data1[i - 3] - value_data1[i - 4]),(value_data1[i - 4] - value_data1[i - 5])) value_data1[i] = value_data1[i - 1] + m i += 1 data_demo = pd.DataFrame(value_data1, columns=['value_data']) print("\n\n该递增趋势数据,清洗后/降噪前:\n", data_demo) # 箱线图分析法检测噪声值 print("递增趋势数据数据量为", n, ",描述信息:", "\n", data_demo['value_data'].describe(percentiles=[.25, .75], include=['object', 'float64'])) # describe distance_data = data_demo['value_data'].quantile(0.75) - data_demo['value_data'].quantile(0.25) # 四分位距,即箱 top_data = data_demo['value_data'].quantile(0.75) + 1.5 * distance_data # 箱线的上限 bottom_data = data_demo['value_data'].quantile(0.25) - 1.5 * distance_data # 箱线的下限 count_data = ((data_demo['value_data'] >= bottom_data) | (data_demo['value_data'] <= top_data)) # 噪声值 index_toarray = np.array(data_demo[count_data == False].index) # 取出异常值索引 print("正常值(True) vs 噪声值个数(False):\n", count_data.value_counts(), "噪声值的行索引:", index_toarray) # 打印噪声值数和索引 # 噪声值处理 data_demo.loc[index_toarray, 'value_data'] = data_demo['value_data'].median().round(3) # 中位数替换 print("降噪后:\n", data_demo) dataResult = dataResult.append(data_demo) # 循环逐一合并递增趋势数组,存储于Dataframe表dataResult s += 2 ''' data.drop_duplicates(subset=['new_column'],keep='first',inplace=True) #根据新列去进行去重 data.dropna(subset=['value_data'], axis=0, inplace=True) # 删除列Dataframe数据类型中value_data存在缺失值的所在行,以保证数据的可靠性 ''' dataResult.index=range(len(dataResult)) #重建Dataframe索引 data.drop('rk',axis=1,inplace=True) #删除列rk #print(data) data.drop('value_data', axis=1, inplace=True) # 删除列value_data #print(data) data=pd.concat([data,dataResult],axis=1) #合并两个Dataframe #print(data) data.dropna(subset=['value_data'], axis=0, inplace=True) # 删除列value_data存在缺失值的所在行 #print(data) data.drop_duplicates(subset=['monitor_id', 'collect_date', 'value_data'], keep='first', inplace=True) # 根据多列进行去重 print(data) MDH_Dataframe_toOracle(data) #转类型 def mapping_data_types(data): #实现Dataframe字段的类型转换(必转,否则就是给自己挖坑,不要问我是怎么知道的) dtypedict = {} for i, j in zip(data.columns, data.dtypes): if "object" in str(j): dtypedict.update({i: VARCHAR(256)}) if "int" in str(j): dtypedict.update({i: NUMBER(12,2)}) if "date" in str(j): dtypedict.update({i: DATE(19)}) return dtypedict #写入数据库 def MDH_Dataframe_toOracle(data): #将Dataframe数据写入ORACLE数据库 from sqlalchemy import types, create_engine from sqlalchemy.dialects.oracle import \ BFILE, BLOB, CHAR, CLOB, DATE, \ DOUBLE_PRECISION, FLOAT, INTERVAL, LONG, NCLOB, \ NUMBER, NVARCHAR, NVARCHAR2, RAW, TIMESTAMP, VARCHAR, \ VARCHAR2 conn = create_engine('oracle+cx_oracle://用户名:密码@IP:端口/实例名', encoding='utf-8',echo=True) # 连接器 #print(conn) dtypedict = mapping_data_types(data) #调用转类型方法mapping_data_types,映射数据类型 tableName='monitor_data_his_cleanaftertb' data.to_sql(tableName,con=conn,if_exists='append',dtype=dtypedict,chunksize=None,index=False) conn.dispose() if __name__ == '__main__': pd.set_option('display.max_columns', None) # 控制台完整显示列 pd.set_option('display.max_rows', 100) # 行数 pd.set_option('display.width',500) # 列数 pd.set_option('max.colwidth',100) #列宽 time_start = time.time() connectToOracle() time_end = time.time() print("执行时间(分钟):", (time_end - time_start) / 60)
-----------------------------------------------------转载需备注博主名和原创网址!!!------------------------------------------------------