oracle 数据库表字段的缺失值统计 -- 基于python

 

  参考资料:python 连接oracle -- sqlalchemy及cx_Oracle的使用详解

  oracle指定表缺失值统计 -- 基于cx_Oracle

import pandas as pd
import cx_Oracle as orcl

# 批量查询数据缺失率
def missing_count(table_name, where_condition={}, **engine):
    #where 条件参数化, str或dict
    sql_tab_columns = "select column_name from user_tab_columns \
        where table_name = '{}'".format(table_name)

    db = ConnectOracle(**engine)
    #sql_select.encode('utf-8')
    columns = db.select_oracle(sql=sql_tab_columns)
    
    #生成select语句
    ss = ''
    for col in columns.COLUMN_NAME:
        ss += 'sum(decode({},null, 1, 0)) as {}, '.format(col, col)
    ss = ss[:-2]
    
    #生成where条件
    wh = ''
    if where_condition:
        wh += ' where '
        if type(where_condition)==str:
            wh += where_condition
        if type(where_condition)==dict:
            for key in where_condition.keys():
                if type(where_condition[key])!=str:
                    wh += ('t.' + str(key) + ' = ' +
                           str(where_condition[key]) + ' and ')
                else:
                    wh += ("t." + str(key) + " = '" +
                           str(where_condition[key]) + "' and ")
            wh = wh[:-4]
    
    #print(ss)
    sql_select =  '''select count(*) as counts, {}
                    from {} t {}
                    '''.format(ss, table_name, wh)
    
    #print(sql_select)
    res = db.select_oracle(sql=sql_select)
    return pd.Series(res.values.tolist()[0], index=res.columns)

  缺失值统计2 -- 基于sqlalchemy

import pandas as pd
#import cx_Oracle as orcl
from sqlalchemy import create_engine

# 批量查询数据缺失率
def missing_count(table_name, where_condition={}, **config):
    #where 条件参数化, str或dict
    
    #定义数据库连接
    #'oracle://qmcbrt:qmcbrt@10.85.31.20:1521/tqmcbdb'
    engine = 'oracle://{username}:{passwd}@{host}:{port}/{sid}'.format(**config)  #dbname -- 各版本语法不同
    db = create_engine(engine)
    #pd.read_sql_query(sql_tab_columns, db)
    #db.execute('truncate table {}'.format(ttb))
    
    #查询列名 -- 用于生成select项
    sql_tab_columns = "select column_name from user_tab_columns where table_name = '{}'".format(table_name)
    columns = pd.read_sql_query(sql_tab_columns, db)
     
    #生成select项
    ss = ''
    for col in columns.column_name:
        ss += 'sum(decode({}, null, 1, 0)) as {}, '.format(col, col)
    ss = ss[:-2]
     
    #生成where条件
    wh = ''
    if where_condition:
        wh += ' where '
        if type(where_condition)==str:
            wh += where_condition
        if type(where_condition)==dict:
            for key in where_condition.keys():
                if type(where_condition[key])!=str:
                    wh += ('t.' + str(key) + ' = ' +
                           str(where_condition[key]) + ' and ')
                else:
                    wh += ("t." + str(key) + " = '" +
                           str(where_condition[key]) + "' and ")
            wh = wh[:-4]
     
    #select语句
    sql_select =  '''select count(*) as counts, {} from {} t {} '''.format(ss, table_name, wh)
     
    #pd.Series(res.values.tolist()[0], index=res.columns)
    res = pd.read_sql_query(sql_select, db)
    return res.iloc[0,:]

  示例

config = {
        'username':'qmcb',
        'passwd':'qmcb',
        'host':'localhost',
        'port':'1521',
        'sid':'tqmcbdb'
        }
where_condition = {
                 'is_normal': 1,
                 'is_below_16': 0,
                 'is_xs': 0,
                 'is_cj': 0,
                 'is_dead': 0,
                 'AAE138_is_not_null': 0,
                 'is_dc': 0,
                 'is_px': 0
                 }
# 计算 QMCB_KM_2019_1_31_1 表的数据缺失数
missing_count('QMCB_KM_2019_1_31_1', where_condition, **config)

  

  

 

posted on 2019-02-26 16:31  iUpoint  阅读(932)  评论(0编辑  收藏  举报

导航