oracle 数据库表字段的缺失值统计 -- 基于python
参考资料:python 连接oracle -- sqlalchemy及cx_Oracle的使用详解
oracle指定表缺失值统计 -- 基于cx_Oracle
import pandas as pd import cx_Oracle as orcl # 批量查询数据缺失率 def missing_count(table_name, where_condition={}, **engine): #where 条件参数化, str或dict sql_tab_columns = "select column_name from user_tab_columns \ where table_name = '{}'".format(table_name) db = ConnectOracle(**engine) #sql_select.encode('utf-8') columns = db.select_oracle(sql=sql_tab_columns) #生成select语句 ss = '' for col in columns.COLUMN_NAME: ss += 'sum(decode({},null, 1, 0)) as {}, '.format(col, col) ss = ss[:-2] #生成where条件 wh = '' if where_condition: wh += ' where ' if type(where_condition)==str: wh += where_condition if type(where_condition)==dict: for key in where_condition.keys(): if type(where_condition[key])!=str: wh += ('t.' + str(key) + ' = ' + str(where_condition[key]) + ' and ') else: wh += ("t." + str(key) + " = '" + str(where_condition[key]) + "' and ") wh = wh[:-4] #print(ss) sql_select = '''select count(*) as counts, {} from {} t {} '''.format(ss, table_name, wh) #print(sql_select) res = db.select_oracle(sql=sql_select) return pd.Series(res.values.tolist()[0], index=res.columns)
缺失值统计2 -- 基于sqlalchemy
import pandas as pd #import cx_Oracle as orcl from sqlalchemy import create_engine # 批量查询数据缺失率 def missing_count(table_name, where_condition={}, **config): #where 条件参数化, str或dict #定义数据库连接 #'oracle://qmcbrt:qmcbrt@10.85.31.20:1521/tqmcbdb' engine = 'oracle://{username}:{passwd}@{host}:{port}/{sid}'.format(**config) #dbname -- 各版本语法不同 db = create_engine(engine) #pd.read_sql_query(sql_tab_columns, db) #db.execute('truncate table {}'.format(ttb)) #查询列名 -- 用于生成select项 sql_tab_columns = "select column_name from user_tab_columns where table_name = '{}'".format(table_name) columns = pd.read_sql_query(sql_tab_columns, db) #生成select项 ss = '' for col in columns.column_name: ss += 'sum(decode({}, null, 1, 0)) as {}, '.format(col, col) ss = ss[:-2] #生成where条件 wh = '' if where_condition: wh += ' where ' if type(where_condition)==str: wh += where_condition if type(where_condition)==dict: for key in where_condition.keys(): if type(where_condition[key])!=str: wh += ('t.' + str(key) + ' = ' + str(where_condition[key]) + ' and ') else: wh += ("t." + str(key) + " = '" + str(where_condition[key]) + "' and ") wh = wh[:-4] #select语句 sql_select = '''select count(*) as counts, {} from {} t {} '''.format(ss, table_name, wh) #pd.Series(res.values.tolist()[0], index=res.columns) res = pd.read_sql_query(sql_select, db) return res.iloc[0,:]
示例
config = { 'username':'qmcb', 'passwd':'qmcb', 'host':'localhost', 'port':'1521', 'sid':'tqmcbdb' } where_condition = { 'is_normal': 1, 'is_below_16': 0, 'is_xs': 0, 'is_cj': 0, 'is_dead': 0, 'AAE138_is_not_null': 0, 'is_dc': 0, 'is_px': 0 } # 计算 QMCB_KM_2019_1_31_1 表的数据缺失数 missing_count('QMCB_KM_2019_1_31_1', where_condition, **config)