数据分析中的变量分箱——德国信贷数据集（Variable binning in data analysis - German credit dataset）

最近读了《Python金融大数据风控建模实战：基于机器学习》（机械工业出版社）这本书的第6章“变量分箱方法”，总结了其中的主要内容，并对代码做了详解，分享给大家。
一、主要知识点：
1. 变量分箱是一种特征工程方法，意在增强变量的可解释性与预测能力。变量分箱方法主要用于连续变量，对于变量取值较稀疏的离散变量也应该进行分箱处理。
2. 变量分箱的好处：
（1）降低异常值的影响，增加模型的稳定性。
（2）缺失值作为特殊变量参与分箱，减少缺失值填补的不确定性。
（3）增加变量的可解释性。
（4）增加变量的非线性。
（5）增加模型的预测效果。
3. 变量分箱的局限性：
（1）同一箱内的样本具有同质性。（2）需要专家经验支持。
4. 变量分箱的注意事项：
（1）分箱结果不宜过多。（2）分箱结果不宜过少。（3）分箱后单调性的要求。
5. 变量分箱流程
二、代码实现
数据的使用还是德国信贷数据集，具体数据集介绍和获取方法请看数据清洗与预处理代码详解——德国信贷数据集（data cleaning and preprocessing - German credit datasets）
  1 # -*- coding: utf-8 -*-
  2 """
  3 第6章：变量分箱方法
  4     1: Chi-merge(卡方分箱)
  5     2: IV(最优IV值分箱)
  6     3: 信息熵(基于树的分箱)
  7 """
  8 import os
  9 import pandas as pd
 10 import numpy as np
 11 from sklearn.model_selection import train_test_split
 12 import warnings
 13 warnings.filterwarnings("ignore")  # 忽略警告
 14 
 15 
 16 def data_read(data_path, file_name):
 17     df = pd.read_csv(os.path.join(data_path, file_name),
 18                      delim_whitespace=True,
 19                      header=None)
 20     # 变量重命名
 21     columns = [
 22         'status_account', 'duration', 'credit_history', 'purpose', 'amount',
 23         'svaing_account', 'present_emp', 'income_rate', 'personal_status',
 24         'other_debtors', 'residence_info', 'property', 'age', 'inst_plans',
 25         'housing', 'num_credits', 'job', 'dependents', 'telephone',
 26         'foreign_worker', 'target'
 27     ]
 28     df.columns = columns
 29     # 将标签变量由状态1,2转为0,1;0表示好用户，1表示坏用户
 30     df.target = df.target - 1
 31     # 数据分为data_train和 data_test两部分，训练集用于得到编码函数，验证集用已知的编码规则对验证集编码
 32     data_train, data_test = train_test_split(df,
 33                                              test_size=0.2,
 34                                              random_state=0,
 35                                              stratify=df.target)
 36     return data_train, data_test
 37 
 38 
 39 def cal_advantage(temp, piont, method, flag='sel'):
 40     """
 41     计算当前切分点下的指标值
 42     # 参数
 43         temp: 上一步的分箱结果，pandas dataframe
 44         piont: 切分点，以此来划分分箱
 45         method: 分箱方法选择，1:chi-merge , 2:IV值, 3:信息熵
 46     """
 47     #    temp = binDS
 48     if flag == 'sel':
 49         # 用于最优切分点选择，这里只是二叉树，即二分
 50         bin_num = 2
 51         # np.empty 依给定的shape, 和数据类型 dtype,  返回一个一维或者多维数组，数组的元素不为空，为随机产生的数据。
 52         good_bad_matrix = np.empty((bin_num, 3))
 53         for ii in range(bin_num):
 54             if ii == 0:
 55                 df_temp_1 = temp[temp['bin_raw'] <= piont]
 56             else:
 57                 df_temp_1 = temp[temp['bin_raw'] > piont]
 58             # 计算每个箱内的好坏样本书
 59             good_bad_matrix[ii][0] = df_temp_1['good'].sum()
 60             good_bad_matrix[ii][1] = df_temp_1['bad'].sum()
 61             good_bad_matrix[ii][2] = df_temp_1['total'].sum()
 62 
 63     elif flag == 'gain':
 64         # 用于计算本次分箱后的指标结果，即分箱数，每增加一个，就要算一下当前分箱下的指标结果
 65         bin_num = temp['bin'].max()
 66         good_bad_matrix = np.empty((bin_num, 3))
 67         for ii in range(bin_num):
 68             df_temp_1 = temp[temp['bin'] == (ii + 1)]
 69             good_bad_matrix[ii][0] = df_temp_1['good'].sum()
 70             good_bad_matrix[ii][1] = df_temp_1['bad'].sum()
 71             good_bad_matrix[ii][2] = df_temp_1['total'].sum()
 72 
 73     # 计算总样本中的好坏样本
 74     total_matrix = np.empty(3)
 75     # sum（）函数用于获取所请求轴的值之和。
 76     total_matrix[0] = temp.good.sum()
 77     total_matrix[1] = temp.bad.sum()
 78     total_matrix[2] = temp.total.sum()
 79 
 80     # Chi-merger分箱
 81     if method == 1:
 82         X2 = 0
 83         # i 是区间的信息
 84         for i in range(bin_num):
 85             # j=0 表示好样本, j=1 表示坏样本
 86             for j in range(2):
 87                 # 期望值 好(坏)样本/总样本 * 该区间的样本总数
 88                 expect = (total_matrix[j] / total_matrix[2]) * good_bad_matrix[i][2]
 89                 # 计算实际值和期望值的差异距离的平方/该样本的期望值
 90                 X2 = X2 + (good_bad_matrix[i][j] - expect)**2 / expect
 91         M_value = X2
 92     # IV分箱
 93     elif method == 2:
 94         if pd.isnull(total_matrix[0]) or pd.isnull(total_matrix[1]) or total_matrix[0] == 0 or total_matrix[1] == 0:
 95             M_value = np.NaN
 96         else:
 97             IV = 0
 98             for i in range(bin_num):
 99                 # 坏好比
100                 weight = good_bad_matrix[i][1] / total_matrix[1] - good_bad_matrix[i][0] / total_matrix[0]
101                 # 本来对照公式觉得这里出现问题，后来化简下方程，发现是对的
102                 IV = IV + weight * np.log((good_bad_matrix[i][1] * total_matrix[0]) / (good_bad_matrix[i][0] * total_matrix[1]))
103             M_value = IV
104     # 信息熵分箱
105     elif method == 3:
106         # 总的信息熵
107         entropy_total = 0
108         for j in range(2):
109             weight = (total_matrix[j] / total_matrix[2])
110             entropy_total = entropy_total - weight * (np.log(weight))
111 
112         # 计算条件熵
113         entropy_cond = 0
114         for i in range(bin_num):
115             entropy_temp = 0
116             for j in range(2):
117                 entropy_temp = entropy_temp - \
118                     ((good_bad_matrix[i][j] / good_bad_matrix[i][2]) * np.log(good_bad_matrix[i][j] / good_bad_matrix[i][2]))
119             entropy_cond = entropy_cond + good_bad_matrix[i][2] / total_matrix[2] * entropy_temp
120 
121         # 计算归一化信息增益
122         M_value = 1 - (entropy_cond / entropy_total)
123     # Best-Ks分箱
124     else:
125         pass
126     return M_value
127 
128 
def best_split(df_temp0, method, bin_num):
    """Find the best cut point inside one bin and split that bin in two.

    Helper of select_split_point.

    # Arguments
        df_temp0: rows of the previous binning result to split, pandas DataFrame
        method: 1: chi-merge, 2: IV, 3: information entropy
        bin_num: label of the bin whose rows are the split candidates
    # Returns
        pandas DataFrame with a new 0/1 ``split`` column and ``bin_raw``
        renumbered from 1 inside each of the two halves
    """
    ordered = df_temp0.sort_values(by=['bin', 'bad_rate'])
    # Number of candidate rows that belong to the requested bin.
    n_candidates = len(ordered[ordered['bin'] == bin_num])

    # Try every cut position and keep the one with the largest criterion.
    best_score, best_point = 0, 1
    for point in range(1, n_candidates):
        score = cal_advantage(ordered, point, method, flag='sel')
        if score > best_score:
            best_score, best_point = score, point

    # Mark the two sides of the chosen cut: 1 = at or below it, 0 = above it.
    ordered['split'] = np.where(ordered['bin_raw'] <= best_point, 1, 0)
    ordered = ordered.drop('bin_raw', axis=1)
    ordered = ordered.sort_values(by=['split', 'bad_rate'])

    # Renumber bin_raw from 1 within each side, side 0 first.
    halves = []
    for side in (0, 1):
        half = ordered[ordered['split'] == side].copy()
        half['bin_raw'] = range(1, len(half) + 1)
        halves.append(half)
    return pd.concat(halves, axis=0)
172 
173 
def select_split_point(temp_bin, method):
    """Perform one binary split: try splitting every bin, keep the best one.

    Helper of cont_var_bin / disc_var_bin.

    # Arguments
        temp_bin: current binning result, pandas DataFrame
        method: 1: chi-merge, 2: IV, 3: information entropy
    # Returns
        (new binning result as a pandas DataFrame sorted by bin and bad_rate,
         criterion value of that binning rounded to 4 decimals)
    """
    ordered = temp_bin.sort_values(by=['bin', 'bad_rate'])
    # Largest bin label in use; a freshly split-off part gets label + 1.
    top_bin = max(ordered['bin'])

    candidates = dict()   # bin label -> binning obtained by splitting that bin
    scores = []           # [bin label, criterion value] pairs
    for label in range(1, top_bin + 1):
        members = ordered[ordered['bin'] == label]
        # A bin that holds a single row cannot be split any further.
        if members.shape[0] <= 1:
            continue
        halves = best_split(members, method, label)
        # Rows on the ``split == 1`` side move to the new bin label.
        halves['bin'] = np.where(halves['split'] == 1, top_bin + 1, halves['bin'])
        untouched = ordered[ordered['bin'] != label]
        # ``halves`` carries the extra ``split`` column; rows from
        # ``untouched`` get NaN there after concatenation.
        candidates[label] = pd.concat([untouched, halves], axis=0, sort=False)
        # Score the binning that results from splitting this bin.
        score = cal_advantage(candidates[label], 0, method, flag='gain')
        scores.append([label, score])

    # Keep only the split with the highest criterion value.
    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    winner, winner_score = ranked[0][0], ranked[0][1]
    newBins = candidates[winner].drop('split', axis=1)
    return newBins.sort_values(by=['bin', 'bad_rate']), round(winner_score, 4)
222 
223 
def init_equal_bin(x, bin_rate):
    """Build the initial equal-width bins. Helper of cont_var_bin.

    # Arguments
        x: values to bin, pandas Series
        bin_rate: the number of initial bins is int(1 / bin_rate)
    # Returns
        pandas DataFrame indexed by ``bin_num`` (starting at 0) with the
        columns ``bin_up`` and ``bin_low``. The first bin starts at -inf and
        the last bin ends at +inf.
    # Raises
        ValueError: if ``bin_rate`` yields fewer than one bin
    """
    # Number of initial bins.
    bin_num = int(1 / bin_rate)
    if bin_num < 1:
        raise ValueError("bin_rate must be in (0, 1], got {0!r}".format(bin_rate))

    # Outlier trimming: the equal-width range runs between the largest value
    # below the 5th percentile and the smallest value above the 95th
    # percentile; everything outside falls into the open-ended edge bins.
    upper_tail = x[x > np.percentile(x, 95)]
    if len(upper_tail) > 0 and len(np.unique(x)) >= 30:
        var_up = min(upper_tail)
    else:
        var_up = max(x)
    lower_tail = x[x < np.percentile(x, 5)]
    if len(lower_tail) > 0:
        var_low = max(lower_tail)
    else:
        var_low = min(x)

    # Bin width and the inner edges between consecutive bins.
    dist_bin = (var_up - var_low) / bin_num
    inner_edges = [var_low + i * dist_bin for i in range(1, bin_num)]
    # Open-ended outer bins. Building both lists from the shared inner edges
    # also gives a single bin the range (-inf, +inf); the original left its
    # upper edge finite.
    bin_low = [-np.inf] + inner_edges
    bin_up = inner_edges + [np.inf]

    result = pd.DataFrame({'bin_up': bin_up, 'bin_low': bin_low})
    result.index.name = 'bin_num'
    return result
268 
269 
def limit_min_sample(temp_cont, bin_min_num_0):
    """Merge bins whose sample count does not exceed the minimum.

    Helper of cont_var_bin. A bin is merged into the next bin (by index
    value); the last bin is merged into the previous one.

    # Arguments
        temp_cont: initial binning result, pandas DataFrame with the columns
            ``good``, ``bad``, ``total``, ``bin_low``, ``bin_up``, ``bad_rate``
        bin_min_num_0: a bin with ``total <= bin_min_num_0`` gets merged
    # Returns
        Merged binning result sorted by ``bad_rate``, pandas DataFrame.
        The input DataFrame is left untouched.
    """
    # Work on a copy: the original wrote into the caller's frame until the
    # first row was dropped.
    temp_cont = temp_cont.copy()
    for i in temp_cont.index:
        rowdata = temp_cont.loc[i, :]
        is_last = i == temp_cont.index.max()
        if is_last:
            # Last bin: the merge target is the closest bin before it.
            neighbours = temp_cont.index[temp_cont.index < i]
        else:
            # Otherwise the merge target is the closest bin after it.
            neighbours = temp_cont.index[temp_cont.index > i]
        if rowdata['total'] > bin_min_num_0:
            continue
        if len(neighbours) == 0:
            # Only one bin is left, so there is nothing to merge it into.
            continue
        ix = neighbours.max() if is_last else neighbours.min()

        # Add this bin's counts to the neighbouring bin.
        temp_cont.loc[ix, 'bad'] = temp_cont.loc[ix, 'bad'] + rowdata['bad']
        temp_cont.loc[ix, 'good'] = temp_cont.loc[ix, 'good'] + rowdata['good']
        temp_cont.loc[ix, 'total'] = temp_cont.loc[ix, 'total'] + rowdata['total']
        # Extend the neighbour's interval so it also covers this bin.
        if is_last:
            temp_cont.loc[ix, 'bin_up'] = rowdata['bin_up']
        else:
            temp_cont.loc[ix, 'bin_low'] = rowdata['bin_low']
        temp_cont = temp_cont.drop(i, axis=0)
    return temp_cont.sort_values(by='bad_rate')
309 
310 
def cont_var_bin_map(x, bin_init):
    """Map raw values to bin labels using a table of bin intervals.

    Used for both the training and the test data.

    # Arguments
        x: raw values, pandas Series
        bin_init: pandas DataFrame with the columns ``bin_up`` and
            ``bin_low``; its index values are used as bin labels
    # Returns
        A copy of ``x`` in which every value inside (bin_low, bin_up] is
        replaced by that bin's label, named ``<x.name>_BIN``. A bin with a
        missing bound receives the missing values.
    """
    mapped = x.copy()
    for label in bin_init.index:
        upper = bin_init['bin_up'][label]
        lower = bin_init['bin_low'][label]
        if pd.isnull(upper) or pd.isnull(lower):
            # A bin without bounds is the dedicated bin for missing values.
            mapped[pd.isnull(mapped)] = label
            continue
        # Boolean mask of the raw values with lower < value <= upper.
        inside = x.gt(lower) & x.le(upper)
        mapped[inside] = label
    mapped.name = mapped.name + "_BIN"
    return mapped
331 
332 
def merge_bin(sub, i):
    """Collapse the rows of one bin into a single interval row.

    # Arguments
        sub: rows of the binning result that belong to one bin, pandas
            DataFrame ordered from the lowest to the highest interval
        i: label of that bin
    # Returns
        One-row pandas DataFrame with the columns ``bin``, ``bin_low``,
        ``bin_up``, ``total``: the lower bound of the first row, the upper
        bound of the last row and the summed sample count.
    """
    lower = sub.iloc[0]['bin_low']
    upper = sub.iloc[-1]['bin_up']
    total = sub['total'].sum()
    # Build the row directly; ``DataFrame.append`` was removed in pandas 2.0.
    return pd.DataFrame([[i, lower, upper, total]],
                        columns=['bin', 'bin_low', 'bin_up', 'total'])
355 
356 
357 # --------------------- 连续变量分箱函数 -------------------- #
def cont_var_bin(x,
                 y,
                 method,
                 mmin=5,
                 mmax=10,
                 bin_rate=0.01,
                 stop_limit=0.1,
                 bin_min_num=20):
    """Bin a continuous variable.

    # Arguments
        x: values to bin, pandas Series
        y: label variable (0 = good, 1 = bad), aligned with ``x``
        method: 1: chi-merge, 2: IV, 3: information entropy
        mmin: minimum number of bins; if the merged initial bins are not more
            than ``mmin`` they are returned as they are
        mmax: maximum number of bins; lowered to (number of merged initial
            bins - 1) when that is smaller
        bin_rate: the equal-width initialisation creates int(1/bin_rate) bins
        stop_limit: early stopping; once the bin count is within
            [mmin, mmax], splitting stops when the relative gain of the
            criterion is <= stop_limit or is NaN
        bin_min_num: initial bins with ``total <= bin_min_num`` are merged
    # Returns
        (bins, gain values, gain rates). ``bins`` is a pandas DataFrame with
        the columns ``bin_low``, ``bin_up``, ``total``, ``bin`` indexed by
        bin label; missing values, if any, get an extra bin with NaN bounds.
    """
    # Set the missing values aside; they become a bin of their own at the end.
    missing = pd.isnull(x)
    df_na = pd.DataFrame({'x': x[missing], 'y': y[missing]})
    y = y[~missing]
    x = x[~missing]

    # Equal-width initial bins, then map every value to its initial bin.
    bin_init = init_equal_bin(x, bin_rate)
    bin_map = cont_var_bin_map(x, bin_init)

    df_temp = pd.concat([x, y, bin_map], axis=1)
    # Good/bad frequencies per initial bin.
    df_temp_1 = pd.crosstab(index=df_temp[bin_map.name], columns=y)
    df_temp_1.rename(columns=dict(zip([0, 1], ['good', 'bad'])), inplace=True)
    # Sample count per initial bin (every column gives the same count).
    df_temp_2 = pd.DataFrame(df_temp.groupby(bin_map.name).count().iloc[:, 0])
    df_temp_2.columns = ['total']
    # Attach the interval bounds of each non-empty bin.
    df_temp_all = pd.merge(pd.concat([df_temp_1, df_temp_2], axis=1), bin_init,
                           left_index=True, right_index=True, how='left')

    # NOTE(review): these change global pandas display options as a side
    # effect; kept because existing callers may rely on them.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)

    # Empty initial bins are absent here, so close the gaps: every bin starts
    # where the previous non-empty bin ends. Plain ``.loc`` writes replace
    # the original chained assignment.
    for j in range(df_temp_all.shape[0] - 1):
        prev_label = df_temp_all.index[j]
        next_label = df_temp_all.index[j + 1]
        if df_temp_all.loc[next_label, 'bin_low'] != df_temp_all.loc[prev_label, 'bin_up']:
            df_temp_all.loc[next_label, 'bin_low'] = df_temp_all.loc[prev_label, 'bin_up']

    # For a continuous variable ``bad_rate`` only stores the initial bin
    # index, which serves as the sort key that keeps bins in value order.
    df_temp_all['bad_rate'] = df_temp_all.index
    # Enforce the minimum sample count per bin by merging neighbours.
    df_temp_all = limit_min_sample(df_temp_all, bin_min_num)

    # The number of final bins cannot reach the number of merged initial bins.
    if mmax >= df_temp_all.shape[0]:
        mmax = df_temp_all.shape[0] - 1
    if mmin >= df_temp_all.shape[0]:
        # Too few bins to search: use the merged initial bins directly.
        gain_value_save0 = 0
        gain_rate_save0 = 0
        df_temp_all['bin'] = np.linspace(1, df_temp_all.shape[0], df_temp_all.shape[0], dtype=int)
        # Copy so that later row additions do not write into a slice.
        data = df_temp_all[['bin_low', 'bin_up', 'total', 'bin']].copy()
        data.index = data['bin']
    else:
        # Start with everything in bin 1 and split repeatedly.
        df_temp_all['bin'] = 1
        df_temp_all['bin_raw'] = range(1, len(df_temp_all) + 1)
        df_temp_all['var'] = df_temp_all.index  # label of the initial bin
        gain_1 = 1e-10
        gain_rate_save0 = []
        gain_value_save0 = []
        # At most mmax - 1 splits, i.e. at most mmax bins.
        for i in range(1, mmax):
            df_temp_all, gain_2 = select_split_point(df_temp_all, method=method)
            gain_rate = gain_2 / gain_1 - 1   # relative gain
            gain_value_save0.append(np.round(gain_2, 4))
            if i == 1:
                gain_rate_save0.append(0.5)
            else:
                gain_rate_save0.append(np.round(gain_rate, 4))
            gain_1 = gain_2
            # Early stopping is only allowed once the bin count is in range.
            if df_temp_all.bin.max() >= mmin and df_temp_all.bin.max() <= mmax:
                if gain_rate <= stop_limit or pd.isnull(gain_rate):
                    break

        df_temp_all = df_temp_all.rename(columns={'var': 'oldbin'})
        temp_Map1 = df_temp_all.drop(['good', 'bad', 'bad_rate', 'bin_raw'], axis=1)
        temp_Map1 = temp_Map1.sort_values(by=['bin', 'oldbin'])

        # One row per final bin with its bounds and sample count. The rows
        # are collected and concatenated once because ``DataFrame.append``
        # was removed in pandas 2.0.
        merged_rows = []
        for i in temp_Map1['bin'].unique():
            sub_Map = temp_Map1[temp_Map1['bin'] == i]
            merged_rows.append(merge_bin(sub_Map, i))
        data = pd.concat(merged_rows, ignore_index=True)

        # Relabel the bins 1..n in ascending order of their lower bound.
        data = data.sort_values(by='bin_low')
        data = data.drop('bin', axis=1)
        mmax = df_temp_all.bin.max()
        data['bin'] = range(1, mmax + 1)
        data.index = data['bin']

    # Missing values form an extra bin without bounds.
    if len(df_na) > 0:
        row_num = data.shape[0] + 1
        data.loc[row_num, 'bin_low'] = np.nan
        data.loc[row_num, 'bin_up'] = np.nan
        data.loc[row_num, 'total'] = df_na.shape[0]
        data.loc[row_num, 'bin'] = data.bin.max() + 1
    return data, gain_value_save0, gain_rate_save0
495 
496 
def cal_bin_value(x, y, bin_min_num_0=10):
    """Initialise one bin per category and merge the under-populated ones.

    # Arguments
        x: discrete variable to bin, pandas Series
        y: label variable (0 = good, 1 = bad)
        bin_min_num_0: a category whose good, bad or total count is
            <= bin_min_num_0 is merged into its neighbour in bad-rate order
    # Returns
        pandas DataFrame with the columns ``good``, ``bad``, ``total``,
        ``bin``, ``var_name``, ``bad_rate``, ``bin_raw``, ordered by
        ``bad_rate``. Merged categories are joined with '%' in ``var_name``.
    """
    # Good/bad counts per category.
    df_temp = pd.crosstab(index=x, columns=y, margins=False)
    df_temp.rename(columns=dict(zip([0, 1], ['good', 'bad'])), inplace=True)
    df_temp = df_temp.assign(total=lambda d: d['good'] + d['bad'],
                             bin=1,
                             var_name=df_temp.index).assign(bad_rate=lambda d: d['bad'] / d['total'])

    # Order the categories by bad rate so neighbours have similar risk.
    df_temp = df_temp.sort_values(by='bad_rate')
    df_temp = df_temp.reset_index(drop=True)

    for i in df_temp.index:
        rowdata = df_temp.loc[i, :]
        is_last = i == df_temp.index.max()
        if is_last:
            # Last row: merge into the closest remaining row before it.
            neighbours = df_temp.index[df_temp.index < i]
        else:
            # Otherwise merge into the closest remaining row after it.
            neighbours = df_temp.index[df_temp.index > i]
        if len(neighbours) == 0:
            # Only one category is left; the original failed with a KeyError
            # here because there is nothing to merge into.
            continue
        ix = neighbours.max() if is_last else neighbours.min()

        if any(rowdata[['good', 'bad', 'total']] <= bin_min_num_0):
            # Add the counts to the neighbour and refresh its bad rate.
            df_temp.loc[ix, 'bad'] = df_temp.loc[ix, 'bad'] + rowdata['bad']
            df_temp.loc[ix, 'good'] = df_temp.loc[ix, 'good'] + rowdata['good']
            df_temp.loc[ix, 'total'] = df_temp.loc[ix, 'total'] + rowdata['total']
            df_temp.loc[ix, 'bad_rate'] = df_temp.loc[ix, 'bad'] / df_temp.loc[ix, 'total']
            # Record which categories the merged bin contains.
            df_temp.loc[ix, 'var_name'] = str(rowdata['var_name']) + '%' + str(df_temp.loc[ix, 'var_name'])
            df_temp = df_temp.drop(i, axis=0)

    # Number the remaining rows 1..n in bad-rate order.
    df_temp['bin_raw'] = range(1, df_temp.shape[0] + 1)
    df_temp = df_temp.reset_index(drop=True)
    return df_temp
551 
552 
def disc_var_bin(x,
                 y,
                 method=1,
                 mmin=3,
                 mmax=8,
                 stop_limit=0.1,
                 bin_min_num=20):
    """Bin a discrete variable.

    If the variable is very sparse it is better to encode it first and bin
    it as a continuous variable.

    # Arguments
        x: values to bin, pandas Series
        y: label variable (0 = good, 1 = bad), aligned with ``x``
        method: 1: chi-merge, 2: IV, 3: information entropy
        mmin: minimum number of bins; set to 2 when the merged categories are
            not more than ``mmin``
        mmax: maximum number of bins; lowered to (number of merged
            categories - 1) when that is smaller
        stop_limit: early stopping; once the bin count is within
            [mmin, mmax], splitting stops when the relative gain of the
            criterion is <= stop_limit
        bin_min_num: minimum sample count passed to cal_bin_value
    # Returns
        (bins, gain values, gain rates, del_key). ``bins`` is a pandas
        DataFrame with the columns ``total``, ``bin``, ``var_name`` sorted by
        ``bin``; ``del_key`` lists the variable name when it should be
        dropped because it ends up with a single bin.
    """
    del_key = []
    # Set the missing values aside; they become a bin of their own at the end.
    missing = pd.isnull(x)
    df_na = pd.DataFrame({'x': x[missing], 'y': y[missing]})
    y = y[~missing]
    x = x[~missing]
    # Numeric categories are turned into strings (integers via float, e.g.
    # 1 -> '1.0'). The abstract NumPy types are used because ``np.float_``
    # was removed in NumPy 2.0 and ``np.int_`` only matched the default
    # integer width.
    if np.issubdtype(x.dtype, np.integer):
        x = x.astype('float').astype('str')
    elif np.issubdtype(x.dtype, np.floating):
        x = x.astype('str')

    # One row per category (after minimum-sample merging) with its counts.
    temp_cont = cal_bin_value(x, y, bin_min_num)

    # With five or fewer distinct values every merged category is its own bin.
    if len(x.unique()) > 5:
        # The number of final bins cannot reach the number of merged categories.
        if mmax >= temp_cont.shape[0]:
            mmax = temp_cont.shape[0] - 1
        if mmin >= temp_cont.shape[0]:
            mmin = 2
            mmax = temp_cont.shape[0] - 1
        if mmax == 1:
            print('变量 {0}合并后分箱数为1，该变量删除'.format(x.name))
            del_key.append(x.name)

        gain_1 = 1e-10
        gain_value_save0 = []
        gain_rate_save0 = []
        # At most mmax - 1 splits, i.e. at most mmax bins.
        for i in range(1, mmax):
            temp_cont, gain_2 = select_split_point(temp_cont, method=method)
            gain_rate = gain_2 / gain_1 - 1   # relative gain
            gain_value_save0.append(np.round(gain_2, 4))
            if i == 1:
                gain_rate_save0.append(0.5)
            else:
                gain_rate_save0.append(np.round(gain_rate, 4))
            gain_1 = gain_2
            # Early stopping is only allowed once the bin count is in range.
            if temp_cont.bin.max() >= mmin and temp_cont.bin.max() <= mmax:
                if gain_rate <= stop_limit:
                    break

        # Kept from the original; it has no effect unless a 'var' column exists.
        temp_cont = temp_cont.rename(columns={'var': x.name})
        temp_cont = temp_cont.drop(['good', 'bad', 'bin_raw', 'bad_rate'], axis=1)
    else:
        temp_cont.bin = temp_cont.bin_raw
        # Copy so that adding the NA row below does not write into a slice.
        temp_cont = temp_cont[['total', 'bin', 'var_name']].copy()
        gain_value_save0 = []
        gain_rate_save0 = []
        del_key = []

    # Missing values form an extra bin labelled 'NA'.
    if len(df_na) > 0:
        index_1 = temp_cont.shape[0] + 1
        temp_cont.loc[index_1, 'total'] = df_na.shape[0]
        temp_cont.loc[index_1, 'bin'] = temp_cont.bin.max() + 1
        temp_cont.loc[index_1, 'var_name'] = 'NA'
    temp_cont = temp_cont.reset_index(drop=True)
    if temp_cont.shape[0] == 1:
        del_key.append(x.name)
    return temp_cont.sort_values(by='bin'), gain_value_save0, gain_rate_save0, del_key
643 
644 
def disc_var_bin_map(x, bin_map):
    """Map raw categories to bin labels using a discrete binning result.

    # Arguments
        x: discrete variable to map, pandas Series (left unmodified)
        bin_map: binning result, pandas DataFrame with the columns ``bin``
            and ``var_name``; ``var_name`` holds the categories of a bin
            joined by '%', and 'NA' marks the bin for missing values
    # Returns
        pandas Series of bin labels named ``<x.name>_BIN``. Values that match
        no category are put into the 'NA' bin when there is one and stay NaN
        otherwise.
    """
    # Numeric categories are compared as strings (integers via float, e.g.
    # 1 -> '1.0'). The conversion is done on an object-dtype copy so that the
    # caller's Series is not overwritten and no strings are written into a
    # numeric Series.
    present = ~pd.isnull(x)
    xx = x[present]
    if np.issubdtype(xx.dtype, np.integer):
        x = x.astype(object)
        x[present] = xx.astype('float').astype('str')
    elif np.issubdtype(xx.dtype, np.floating):
        x = x.astype(object)
        x[present] = xx.astype('str')

    # category -> bin label
    d = dict()
    for i in bin_map.index:
        for j in bin_map.loc[i, 'var_name'].split('%'):
            if j != 'NA':
                d[j] = bin_map.loc[i, 'bin']

    new_x = x.map(d)

    # Unmapped values go to the 'NA' bin, if the binning has one.
    unmapped = pd.isnull(new_x)
    if unmapped.any():
        index_1 = bin_map.index[bin_map.var_name == 'NA']
        if len(index_1) > 0:
            # Assign the scalar label; the original assigned a one-element
            # list, which does not match a mask that selects several rows.
            new_x[unmapped] = bin_map.loc[index_1, 'bin'].iloc[0]
    new_x.name = x.name + '_BIN'

    return new_x
677 
678 
if __name__ == '__main__':
    # Demo: bin the German credit data and map train/test sets to bin labels.
    path = os.getcwd()
    data_path = os.path.join(path, 'data')
    file_name = 'german.csv'
    # Load the data. Explicit copies make the in-place edits below act on
    # these frames instead of on slices of the frame read from disk.
    data_train, data_test = data_read(data_path, file_name)
    data_train = data_train.copy()
    data_test = data_test.copy()
    print("data_train.shape = ", data_train.shape)
    print("data_test.shape = ", data_test.shape)

    dict_cont_bin = {}
    cont_name = ['duration', 'amount', 'income_rate', 'residence_info', 'age', 'num_credits', 'dependents']

    # ------------------------ continuous variables -------------------------- #
    # Inject missing values into rows 1..29 (by position) to exercise the
    # missing-value bin. The column is cast to float first so it can hold
    # NaN, and a single ``.iloc`` write replaces the chained assignment
    # ``data_train.amount[1:30] = np.nan``.
    data_train['amount'] = data_train['amount'].astype(float)
    data_train.iloc[1:30, data_train.columns.get_loc('amount')] = np.nan
    # Each call bins exactly one variable; try all three criteria on amount.
    data_test1, gain_value_save1, gain_rate_save1 = cont_var_bin(
        data_train.amount,
        data_train.target,
        method=1,
        mmin=4,
        mmax=10,
        bin_rate=0.01,
        stop_limit=0.1,
        bin_min_num=20)

    data_test2, gain_value_save2, gain_rate_save2 = cont_var_bin(
        data_train.amount,
        data_train.target,
        method=2,
        mmin=4,
        mmax=10,
        bin_rate=0.01,
        stop_limit=0.1,
        bin_min_num=20)

    data_test3, gain_value_save3, gain_rate_save3 = cont_var_bin(
        data_train.amount,
        data_train.target,
        method=3,
        mmin=4,
        mmax=10,
        bin_rate=0.01,
        stop_limit=0.1,
        bin_min_num=20)

    # Bin every continuous variable and keep the result per variable.
    for i in cont_name:
        dict_cont_bin[i], gain_value_save, gain_rate_save = cont_var_bin(
            data_train[i],
            data_train.target,
            method=1,
            mmin=4,
            mmax=10,
            bin_rate=0.01,
            stop_limit=0.1,
            bin_min_num=20)

    # Map the continuous training variables to their bins.
    df_cont_bin_train = pd.DataFrame()
    for i in dict_cont_bin.keys():
        print("dict_cont_bin.keys = ", i)
        df_cont_bin_train = pd.concat([
            df_cont_bin_train,
            cont_var_bin_map(data_train[i], dict_cont_bin[i])], axis=1)

    # ---------------------- discrete variables ---------------------- #
    # Same missing-value injection as above, for a discrete variable.
    data_train.iloc[1:30, data_train.columns.get_loc('purpose')] = np.nan
    data_disc_test1, gain_value_save1, gain_rate_save1, del_key = disc_var_bin(
        data_train.purpose,
        data_train.target,
        method=1,
        mmin=4,
        mmax=10,
        stop_limit=0.1,
        bin_min_num=10)

    data_disc_test2, gain_value_save2, gain_rate_save2, del_key = disc_var_bin(
        data_train.purpose,
        data_train.target,
        method=2,
        mmin=4,
        mmax=10,
        stop_limit=0.1,
        bin_min_num=10)

    data_disc_test3, gain_value_save3, gain_rate_save3, del_key = disc_var_bin(
        data_train.purpose,
        data_train.target,
        method=3,
        mmin=4,
        mmax=10,
        stop_limit=0.1,
        bin_min_num=10)

    pd.set_option('display.max_rows', 60)
    pd.set_option('display.max_columns', 0)
    dict_disc_bin = {}
    del_key = []
    # Every column that is neither continuous nor the label is discrete.
    disc_name = [x for x in data_train.columns if x not in cont_name]
    disc_name.remove('target')
    for i in disc_name:
        print("disc_name = ", i)
        dict_disc_bin[i], gain_value_save, gain_rate_save, del_key_1 = disc_var_bin(
                data_train[i],
                data_train.target,
                method=1,
                mmin=3,
                mmax=8,
                stop_limit=0.1,
                bin_min_num=5)
        if len(del_key_1) > 0:
            del_key.extend(del_key_1)
    # Drop the variables that ended up with a single bin. ``pop`` with a
    # default tolerates a name that was reported more than once.
    for j in del_key:
        dict_disc_bin.pop(j, None)

    # Map the discrete training variables to their bins.
    df_disc_bin_train = pd.DataFrame()
    for i in dict_disc_bin.keys():
        print("离散变量分箱映射: ", i)
        df_disc_bin_train = pd.concat([
            df_disc_bin_train,
            disc_var_bin_map(data_train[i], dict_disc_bin[i])], axis=1)

    # Map the test data with the rules learnt on the training data:
    # continuous variables first ...
    df_cont_bin_test = pd.DataFrame()
    for i in dict_cont_bin.keys():
        df_cont_bin_test = pd.concat([
            df_cont_bin_test,
            cont_var_bin_map(data_test[i], dict_cont_bin[i])], axis=1)
    # ... then the discrete ones.
    df_disc_bin_test = pd.DataFrame()
    for i in dict_disc_bin.keys():
        df_disc_bin_test = pd.concat([
            df_disc_bin_test,
            disc_var_bin_map(data_test[i], dict_disc_bin[i])], axis=1)
posted @ 2021-10-15 17:53 ttweixiao9999 阅读(1292) 评论(0) 编辑收藏举报
刷新页面返回顶部
ttweixiao9999

不要轻视他人,每个人都有我们值得学习的地方 Don't look down on others, everyone has something to learn from

数据分析中的变量分箱——德国信贷数据集（variable bin in data analysis -German credit datasets）

公告