特征分箱

一、类别型特征

1)类别数在5个以下,可以直接根据类别来分箱 (binning_cate)

2)类别数在5个以上,建议做降基处理,再根据降基后的类别做分箱

def binning_cate(df, col, target):
    """Bin a categorical feature by its raw category values and evaluate each bin.

    Args:
        df: Data set (pandas DataFrame).
        col: Name of the feature column to bin.
        target: Name of the good/bad label column. The column is summed to
            count bad samples, so it is presumably coded 1 = bad, 0 = good.

    Returns:
        bin_df: One row per category with the columns '特征名', '分箱结果',
        '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV',
        '准确率', '召回率', '打扰率' and 'KS'. 'IV' and the four metrics
        returned by cal_ks are feature-level values repeated on every row.
    """
    bad = df[target].sum()
    good = df[target].count() - bad

    # Group the data set itself: the previous `pd.groupby([col], ...)` call
    # never received `df`, so no bin could ever be computed.
    labels_by_bin = df.groupby(col)[target]

    stats = labels_by_bin.count().to_frame('样本数')
    stats['黑样本数'] = labels_by_bin.sum()
    stats['白样本数'] = stats['样本数'] - stats['黑样本数']
    stats['逾期用户占比'] = stats['黑样本数'] / stats['样本数']

    # Share of all bad / all good samples that falls into each bin.
    bad_share = stats['黑样本数'] / bad
    good_share = stats['白样本数'] / good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is the sum of the per-bin contributions, broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    bin_df = stats.reset_index().rename(columns={col: '分箱结果'})
    bin_df.insert(0, '特征名', col)

    # Feature-level KS and the metrics at the KS threshold (cal_ks is defined
    # elsewhere in this file).
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df

二、数值型特征

1)离散型数值特征(特征value的变动幅度较小):

若特征value的非重复计数在5个以下,可以直接根据非重复计数值来分箱(binning_cate)

若特征value的非重复计数在5个以上,建议根据业务解释或者数据分布做自定义分箱(binning_self)

2)连续型数值特征(特征value的变动幅度较大):

可以用卡方分箱或自定义分箱。(binning_num,binning_self)

PS:一些特征用卡方分箱时可能会报错,建议将这些特征改为手动自定义分箱(binning_self)

def binning_self(df, col, target, cut=None, right_border=True):
    """Bin a numeric feature on user-supplied edges and evaluate each bin.

    Args:
        df: Data set (pandas DataFrame).
        col: Name of the feature column to bin.
        target: Name of the good/bad label column. The column is summed to
            count bad samples, so it is presumably coded 1 = bad, 0 = good.
        cut: List of custom bin edges, passed to ``pd.cut`` as the bins.
        right_border: Passed to ``pd.cut`` as ``right``; True gives
            left-open/right-closed intervals, False left-closed/right-open.

    Returns:
        bin_df: One row per interval with the columns '特征名', '分箱结果',
        '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV',
        '准确率', '召回率', '打扰率' and 'KS'.
    """
    n_bad = df[target].sum()
    n_good = df[target].count() - n_bad

    intervals = pd.cut(df[col], cut, right=right_border)
    labels_by_bin = df.groupby(intervals)[target]

    stats = labels_by_bin.count().to_frame('样本数')
    stats['黑样本数'] = labels_by_bin.sum()
    stats['白样本数'] = stats['样本数'] - stats['黑样本数']
    stats['逾期用户占比'] = stats['黑样本数'] / stats['样本数']

    # Share of all bad / all good samples that falls into each bin.
    bad_share = stats['黑样本数'] / n_bad
    good_share = stats['白样本数'] / n_good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is the sum of the per-bin contributions, broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    bin_df = stats.reset_index().rename(columns={col: '分箱结果'})
    bin_df.insert(0, '特征名', col)

    # Feature-level KS and the metrics at the KS threshold.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df
def binning_num(df, target, col, max_bin=None, min_binpct=None):
    """Bin a numeric feature on ChiMerge cut points and evaluate each bin.

    Note the parameter order: ``target`` comes before ``col`` here.

    Args:
        df: Data set (pandas DataFrame).
        target: Name of the good/bad label column. The column is summed to
            count bad samples, so it is presumably coded 1 = bad, 0 = good.
        col: Name of the feature column to bin.
        max_bin: Maximum number of bins, forwarded to ChiMerge.
        min_binpct: Minimum share of all samples per bin, forwarded to ChiMerge.

    Returns:
        bin_df: One row per interval with the columns '特征名', '分箱结果',
        '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV',
        '准确率', '召回率', '打扰率' and 'KS'.
    """
    n_bad = df[target].sum()
    n_good = df[target].count() - n_bad

    # ChiMerge is not defined in this file; it is expected to return a list
    # of cut points, which is then padded with open-ended outer edges.
    cut = ChiMerge(df, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut.insert(0, float('-inf'))
    cut.append(float('inf'))

    intervals = pd.cut(df[col], cut)
    labels_by_bin = df.groupby(intervals)[target]

    stats = labels_by_bin.count().to_frame('样本数')
    stats['黑样本数'] = labels_by_bin.sum()
    stats['白样本数'] = stats['样本数'] - stats['黑样本数']
    stats['逾期用户占比'] = stats['黑样本数'] / stats['样本数']

    # Share of all bad / all good samples that falls into each bin.
    bad_share = stats['黑样本数'] / n_bad
    good_share = stats['白样本数'] / n_good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is the sum of the per-bin contributions, broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    bin_df = stats.reset_index().rename(columns={col: '分箱结果'})
    bin_df.insert(0, '特征名', col)

    # Feature-level KS and the metrics at the KS threshold.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df

三、特征有缺失

1)缺失率在5%以下,可以先对缺失做填充处理再分箱(binning_num)

2)缺失率在5%以上,建议将缺失当作一个类别来分箱(binning_sparse_col)

def binning_sparse_col(df, target, col, max_bin=None, min_binpct=None, sparse_value=None):
    """Bin a feature with one dedicated bin for a sparse value, ChiMerge for the rest.

    Rows whose feature equals ``sparse_value`` form a bin of their own; the
    remaining rows are binned on ChiMerge cut points. Both parts are then
    stacked into a single report.

    Args:
        df: Data set (pandas DataFrame).
        target: Name of the good/bad label column. The column is summed to
            count bad samples, so it is presumably coded 1 = bad, 0 = good.
        col: Name of the feature column to bin.
        max_bin: Maximum number of bins, forwarded to ChiMerge.
        min_binpct: Minimum share of all samples per bin, forwarded to ChiMerge.
        sparse_value: The value that gets its own bin (e.g. 0, or the
            placeholder used for missing values).

    Returns:
        bin_df: One row per bin with the columns '特征名', '分箱结果',
        '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV',
        '准确率', '召回率', '打扰率' and 'KS'. 'IV' is rounded to 3 decimals.
    """
    n_bad = df[target].sum()
    n_good = df[target].count() - n_bad

    def _bin_stats(part, intervals):
        # Per-bin counts, bad rate, WOE and IV contribution for one slice of
        # df; the shares are taken against the totals of the full data set.
        labels_by_bin = part.groupby(intervals)[target]
        stats = labels_by_bin.count().to_frame('样本数')
        stats['黑样本数'] = labels_by_bin.sum()
        stats['白样本数'] = stats['样本数'] - stats['黑样本数']
        stats['逾期用户占比'] = stats['黑样本数'] / stats['样本数']
        stats['badattr'] = stats['黑样本数'] / n_bad
        stats['goodattr'] = stats['白样本数'] / n_good
        stats['WOE'] = np.log(stats['badattr'] / stats['goodattr'])
        stats['bin_iv'] = (stats['badattr'] - stats['goodattr']) * stats['WOE']
        return stats.reset_index()

    # The sparse value (0 or a missing-value placeholder) gets its own bin.
    is_sparse = df[col] == sparse_value
    sparse_part = df[is_sparse]
    rest_part = df[~is_sparse]

    sparse_edges = [float('-inf'), sparse_value]
    sparse_stats = _bin_stats(sparse_part, pd.cut(sparse_part[col], sparse_edges))

    # ChiMerge binning for everything else, with the sparse value as the
    # lowest edge.
    # NOTE(review): rows with a value below sparse_value would fall outside
    # these edges — confirm the data never contains such values.
    cut = ChiMerge(rest_part, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut.insert(0, sparse_value)
    cut.append(float('inf'))
    rest_stats = _bin_stats(rest_part, pd.cut(rest_part[col], cut))

    # Stack both parts; IV is summed over all bins of both parts.
    bin_df = pd.concat([sparse_stats, rest_stats], axis=0)
    bin_df['IV'] = bin_df['bin_iv'].sum().round(3)

    bin_df = bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1)
    bin_df = bin_df.rename(columns={col: '分箱结果'})
    bin_df.insert(0, '特征名', col)

    # Feature-level KS and the metrics at the KS threshold.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df

四、稀疏特征分箱

建议将稀疏值(一般为0)单独分为一箱,剩下的值做卡方或者自定义分箱(binning_sparse_col)

五、附录

  • 指标评估函数

    def cal_ks(df, col, target):
        """Compute a feature's KS value and the metrics at the KS threshold.

        Every candidate threshold t splits the data into "feature <= t" and
        the rest. KS is the largest absolute gap between the share of good
        and the share of bad samples selected by "feature <= t". Precision,
        recall and false-positive rate are reported for that same split, so
        abs(tpr - fpr) equals the unrounded KS.

        Args:
            df: Data set (pandas DataFrame).
            col: Name of the feature column.
            target: Name of the good/bad label column (1 = bad, 0 = good).

        Returns:
            ks: KS value, rounded to 3 decimals.
            precision: Share of bad samples among rows with feature <= threshold.
            tpr: Recall, i.e. share of all bad samples with feature <= threshold.
            fpr: Share of all good samples with feature <= threshold.

        Raises:
            IndexError: If no KS value equals the maximum (all values NaN,
                e.g. when the data holds no bad or no good samples).
        """
        bad = df[target].sum()
        good = df[target].count() - bad
        values = df[col]
        labels = df[target]

        if values.nunique() <= 200:
            # Few distinct values: every observed (non-missing) value is a
            # candidate threshold.
            value_bin = sorted(values.dropna().unique())
        else:
            # Many distinct values: scan 200 equally spaced thresholds.
            low = values.min()
            step = (values.max() - low) / 200
            value_bin = [low + i * step for i in range(1, 201)]

        ks_list = []
        for threshold in value_bin:
            # Use `<=` so the curve describes exactly the split that the
            # metrics below are computed on; the earlier strict `<` shifted
            # the reported metrics to a neighbouring threshold.
            selected = labels[values <= threshold]
            bad_hit = selected.sum()
            good_hit = len(selected) - bad_hit
            ks_list.append(abs(good_hit / good - bad_hit / bad))

        # The maximum is computed once; the first threshold reaching it wins.
        best_ks = max(ks_list)
        ks = round(best_ks, 3)
        ks_value = [value_bin[i] for i, j in enumerate(ks_list) if j == best_ks][0]

        at_or_below = df[df[col] <= ks_value]
        n_selected = at_or_below.shape[0]
        n_bad_selected = at_or_below[at_or_below[target] == 1].shape[0]
        n_good_selected = at_or_below[at_or_below[target] == 0].shape[0]

        precision = n_bad_selected / n_selected
        tpr = n_bad_selected / bad
        fpr = n_good_selected / good

        return ks, precision, tpr, fpr
    
  • 卡方分箱报错

    for col in tqdm(err_col):
        # Fallback for features on which chi-square binning failed: use the
        # quartiles as bin edges. The set removes duplicated edges, which
        # occur when several quartiles coincide.
        quartiles = [df[col].quantile(q) for q in (0.25, 0.5, 0.75)]
        cut = sorted({float('-inf'), *quartiles, float('inf')})

        bin_df3 = binning_self(df, col, target, cut=cut, right_border=True)
    
  • 合并分箱结果

    # Split the columns by dtype; the label column is not a feature.
    cate_col = list(df.select_dtypes(include=['O']).columns)
    num_col = [
        x
        for x in df.select_dtypes(include=['int64', 'float64']).columns
        if x != 'label'
    ]

    # Bin the categorical features.
    bin_cate_list = []
    for col in cate_col:
        bin_cate = binning_cate(df, col, target)
        # 'rank' keeps the original bin order for the final sort.
        bin_cate['rank'] = list(range(1, len(bin_cate) + 1))
        bin_cate_list.append(bin_cate)

    # Bin the numeric features, split by missing rate (threshold 5%).
    num_col1 = [x for x in miss_df[miss_df.missing_pct > 0.05]['col'] if x in num_col]
    num_col2 = [x for x in miss_df[miss_df.missing_pct <= 0.05]['col'] if x in num_col]

    # High missing rate: the placeholder -999 gets a bin of its own.
    bin_num_list1 = []
    err_col1 = []
    for col in tqdm(num_col1):
        try:
            bin_df1 = binning_sparse_col(df, 'label', col, min_binpct=0.05, max_bin=4, sparse_value=-999)
            bin_df1['rank'] = list(range(1, len(bin_df1) + 1))
            bin_num_list1.append(bin_df1)
        except (IndexError, ZeroDivisionError):
            # Remember the feature so it can be re-binned on quartiles below.
            err_col1.append(col)

    # Low missing rate: plain chi-square binning.
    bin_num_list2 = []
    err_col2 = []
    for col in tqdm(num_col2):
        try:
            bin_df2 = binning_num(df, 'label', col, min_binpct=0.05, max_bin=5)
            bin_df2['rank'] = list(range(1, len(bin_df2) + 1))
            bin_num_list2.append(bin_df2)
        except (IndexError, ZeroDivisionError):
            err_col2.append(col)

    # Features on which chi-square binning failed: fall back to quartile edges.
    err_col = err_col1 + err_col2
    bin_num_list3 = []
    if err_col:
        for col in tqdm(err_col):
            ninf = float('-inf')
            inf = float('inf')
            q_25, q_50, q_75 = (df[col].quantile(q) for q in (0.25, 0.5, 0.75))

            # The set removes duplicated edges when quartiles coincide.
            cut = sorted({ninf, q_25, q_50, q_75, inf})

            bin_df3 = binning_self(df, col, target, cut=cut, right_border=True)
            bin_df3['rank'] = list(range(1, len(bin_df3) + 1))
            bin_num_list3.append(bin_df3)

    bin_all_list = bin_num_list1 + bin_num_list2 + bin_num_list3 + bin_cate_list

    # Strongest features (highest IV) first, bins in their original order.
    feature_result = (
        pd.concat(bin_all_list, axis=0)
        .sort_values(['IV', 'rank'], ascending=[False, True])
        .drop(['rank'], axis=1)
    )
    order_col = [
        '特征名', '分箱结果', '样本数', '黑样本数', '白样本数', '逾期用户占比',
        'WOE', 'IV', '准确率', '召回率', '打扰率', 'KS',
    ]
    feature_result = feature_result[order_col]
    
posted @ 2022-01-23 23:00  wangfan000  阅读(340)  评论(0)  编辑  收藏  举报