Building a Scorecard
A Scorecard Case Study
Data Preprocessing
%matplotlib inline
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
# Load the data
data = pd.read_csv("./rankingcard.csv", index_col=0)
data.head()
SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
2 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
3 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
4 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
5 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
# Drop duplicate rows
data.drop_duplicates(inplace=True)
# Check the proportion of missing values per column
data.isnull().sum() / data.shape[0]
SeriousDlqin2yrs 0.000000
RevolvingUtilizationOfUnsecuredLines 0.000000
age 0.000000
NumberOfTime30-59DaysPastDueNotWorse 0.000000
DebtRatio 0.000000
MonthlyIncome 0.195601
NumberOfOpenCreditLinesAndLoans 0.000000
NumberOfTimes90DaysLate 0.000000
NumberRealEstateLoansOrLines 0.000000
NumberOfTime60-89DaysPastDueNotWorse 0.000000
NumberOfDependents 0.025624
dtype: float64
# Drop the rows where NumberOfDependents is missing
data.drop(index=data[data["NumberOfDependents"].isna()].index, inplace=True)
# Rebuild the index
data.index = range(data.shape[0])
data
type(data)
pandas.core.frame.DataFrame
# Wrap random-forest imputation of a column in a function
def fill_miss_rfr(x, y, to_fill: str, n_estimators: int) -> np.ndarray:
    """Impute one column of x with a random forest regressor.
    Args:
        x (DataFrame): feature matrix containing the column to fill
        y (Series): complete label column with no missing values
        to_fill (str): name of the column to fill
        n_estimators (int): number of trees in the random forest
    Returns:
        np.ndarray: predicted values for the missing entries
    """
    # Build the new feature matrix (all other columns plus the label)
    # and use the column to fill as the new target
    df = x.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    # Rows where the target is known become the training set,
    # rows where it is missing become the prediction set
    y_train = fill[fill.notna()]
    y_test = fill[fill.isna()]
    x_train = df.iloc[y_train.index, :]
    x_test = df.iloc[y_test.index, :]
    # Fit a random forest and predict the missing values
    from sklearn.ensemble import RandomForestRegressor as RFR
    rfr = RFR(n_estimators=n_estimators)
    rfr.fit(x_train, y_train)
    y_predict = rfr.predict(x_test)
    return y_predict
# Prepare the arguments: x is all features, y is the label column
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Predict the missing MonthlyIncome values
y_predict = fill_miss_rfr(x, y, "MonthlyIncome", 100)
# Fill them back into MonthlyIncome
data.loc[data.loc[:, "MonthlyIncome"].isna(), "MonthlyIncome"] = y_predict
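A quick sanity check (ours, not part of the original run) confirms the imputation left no gaps:
# MonthlyIncome should now contain no missing values
assert data["MonthlyIncome"].isna().sum() == 0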
Handling Outliers with Descriptive Statistics
data.describe([0.25, 0.5, 0.75, 0.90]).T
count | mean | std | min | 25% | 50% | 75% | 90% | max | |
---|---|---|---|---|---|---|---|---|---|
SeriousDlqin2yrs | 145563.0 | 0.067538 | 0.250952 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.0 |
RevolvingUtilizationOfUnsecuredLines | 145563.0 | 5.941378 | 250.510835 | 0.0 | 0.031218 | 0.158817 | 0.561085 | 0.976557 | 50708.0 |
age | 145563.0 | 52.110701 | 14.567652 | 0.0 | 41.000000 | 52.000000 | 62.000000 | 72.000000 | 107.0 |
NumberOfTime30-59DaysPastDueNotWorse | 145563.0 | 0.389185 | 3.756944 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 98.0 |
DebtRatio | 145563.0 | 334.548251 | 1947.228209 | 0.0 | 0.173934 | 0.359090 | 0.770641 | 1150.000000 | 329664.0 |
MonthlyIncome | 145563.0 | 5552.682762 | 13384.682061 | 0.0 | 2000.000000 | 4500.000000 | 7500.000000 | 10841.800000 | 3008750.0 |
NumberOfOpenCreditLinesAndLoans | 145563.0 | 8.553788 | 5.141132 | 0.0 | 5.000000 | 8.000000 | 11.000000 | 15.000000 | 58.0 |
NumberOfTimes90DaysLate | 145563.0 | 0.231309 | 3.728803 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 98.0 |
NumberRealEstateLoansOrLines | 145563.0 | 1.033346 | 1.133115 | 0.0 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 54.0 |
NumberOfTime60-89DaysPastDueNotWorse | 145563.0 | 0.205622 | 3.712455 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 98.0 |
NumberOfDependents | 145563.0 | 0.759863 | 1.116141 | 0.0 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 20.0 |
# There is a record with age 0, which is clearly an outlier; drop it
(data["age"] == 0).sum()
data = data[data["age"] != 0]
# NumberOfTimes90DaysLate tops out at 98, far beyond any plausible count
# for a two-year window; keep only rows below 90
data = data[data.loc[:, "NumberOfTimes90DaysLate"] < 90]
# Reset the index
data.index = range(data.shape[0])
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145354 entries, 0 to 145353
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SeriousDlqin2yrs 145354 non-null int64
1 RevolvingUtilizationOfUnsecuredLines 145354 non-null float64
2 age 145354 non-null int64
3 NumberOfTime30-59DaysPastDueNotWorse 145354 non-null int64
4 DebtRatio 145354 non-null float64
5 MonthlyIncome 145354 non-null float64
6 NumberOfOpenCreditLinesAndLoans 145354 non-null int64
7 NumberOfTimes90DaysLate 145354 non-null int64
8 NumberRealEstateLoansOrLines 145354 non-null int64
9 NumberOfTime60-89DaysPastDueNotWorse 145354 non-null int64
10 NumberOfDependents 145354 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.2 MB
Checking Class Balance
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
sample_n = y.value_counts().sum()
sample_0 = y.value_counts()[0]
sample_1 = y.value_counts()[1]
print(
    "Total samples: {}, class 0: {:.2%}, class 1: {:.2%}".format(
        sample_n, sample_0 / sample_n, sample_1 / sample_n
    )
)
Total samples: 145354, class 0: 93.32%, class 1: 6.68%
# Oversample the minority class (1) with SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
x, y = sm.fit_resample(x, y)
sample_n = y.value_counts().sum()
sample_0 = y.value_counts()[0]
sample_1 = y.value_counts()[1]
print(
    "Total samples: {}, class 0: {:.2%}, class 1: {:.2%}".format(
        sample_n, sample_0 / sample_n, sample_1 / sample_n
    )
)
Total samples: 271296, class 0: 50.00%, class 1: 50.00%
data
SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
1 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
2 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
3 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
4 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
145349 | 0 | 0.040674 | 74 | 0 | 0.225131 | 2100.0 | 4 | 0 | 1 | 0 | 0.0 |
145350 | 0 | 0.299745 | 44 | 0 | 0.716562 | 5584.0 | 4 | 0 | 1 | 0 | 2.0 |
145351 | 0 | 0.246044 | 58 | 0 | 3870.000000 | 0.0 | 18 | 0 | 1 | 0 | 0.0 |
145352 | 0 | 0.000000 | 30 | 0 | 0.000000 | 5716.0 | 4 | 0 | 0 | 0 | 0.0 |
145353 | 0 | 0.850283 | 64 | 0 | 0.249908 | 8158.0 | 8 | 0 | 2 | 0 | 0.0 |
145354 rows × 11 columns
Splitting Train and Validation Sets and Preparing for Binning
from sklearn.model_selection import train_test_split as tts
x_train, x_vali, y_train, y_vali = tts(x, y, test_size=0.3, random_state=420)
model_data = pd.concat([y_train, x_train], axis=1)
model_data.index = range(model_data.shape[0])
model_data.columns = data.columns
model_vali = pd.concat([y_vali, x_vali], axis=1)
model_vali.index = range(model_vali.shape[0])
model_vali.columns = data.columns
model_data.to_csv("./model_data.csv")
model_vali.to_csv("./model_vali.csv")
Binning
Equal-Frequency Binning
# Start with 20 equal-frequency bins on age
model_data["qcut"], updown = pd.qcut(model_data["age"], retbins=True, q=20)
coount_0 = (
model_data[model_data["SeriousDlqin2yrs"] == 0]
.groupby(by=model_data["qcut"])
.count()["SeriousDlqin2yrs"]
)
coount_1 = (
model_data[model_data["SeriousDlqin2yrs"] == 1]
.groupby(by=model_data["qcut"])
.count()["SeriousDlqin2yrs"]
)
num_bins = [*zip(updown, updown[1:], coount_0, coount_1)]
num_bins
[(21.0, 28.0, 4121, 7178),
(28.0, 31.0, 3476, 5783),
(31.0, 34.0, 3933, 6662),
(34.0, 36.0, 2896, 4652),
(36.0, 39.0, 5108, 7417),
(39.0, 41.0, 3963, 5843),
(41.0, 43.0, 3957, 5558),
(43.0, 45.0, 4317, 5691),
(45.0, 46.0, 2342, 3240),
(46.0, 48.0, 4864, 6150),
(48.0, 50.0, 4861, 6038),
(50.0, 52.0, 4621, 5523),
(52.0, 54.0, 4560, 4623),
(54.0, 56.0, 4498, 4010),
(56.0, 58.0, 4468, 3359),
(58.0, 61.0, 6451, 4746),
(61.0, 63.0, 4712, 2228),
(63.0, 67.0, 6974, 2578),
(67.0, 73.0, 6975, 2041),
(73.0, 107.0, 7998, 1492)]
Defining WOE and IV Functions
# Compute WOE
columns = ["min", "max", "count_0", "count_1"]
df = pd.DataFrame(num_bins, columns=columns)
df["total"] = df.count_0 + df.count_1
df["percentage"] = df.total / df.total.sum()
df["bad_rate"] = df.count_1 / df.total
df["good%"] = df.count_0 / df.count_0.sum()
df["bad%"] = df.count_1 / df.count_1.sum()
df["woe"] = np.log(df["good%"] / df["bad%"])
# Compute IV
rate = df["good%"] - df["bad%"]
iv = np.sum(rate * df["woe"])
# Wrap WOE and IV into reusable functions
def get_woe(data: list) -> pd.DataFrame:
    """Compute per-bin WOE from num_bins data.
    Args:
        data (list): list of (min, max, count_0, count_1) tuples
    Returns:
        pd.DataFrame: bin table with WOE and related columns
    """
    df = data.copy()
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(df, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percentage"] = df.total / df.total.sum()
    df["bad_rate"] = df.count_1 / df.total
    df["good%"] = df.count_0 / df.count_0.sum()
    df["bad%"] = df.count_1 / df.count_1.sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    return df
def get_iv(data: pd.DataFrame) -> float:
    """Compute the IV from a bin table produced by get_woe.
    Args:
        data (pd.DataFrame): output of get_woe
    Returns:
        float: the information value
    """
    rate = data["good%"] - data["bad%"]
    iv = np.sum(rate * data["woe"])
    return iv
get_iv(get_woe(num_bins))
0.3451906890667462
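As a commonly cited rule of thumb, an IV below 0.02 marks an unpredictive feature, 0.02 to 0.1 a weak one, 0.1 to 0.3 a medium one, and above 0.3 a strong one; the 0.345 obtained here suggests age is a strong predictor.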
num_bins_ = num_bins.copy()
[*zip(range(len(num_bins_)), num_bins_)]
[(0, (21.0, 28.0, 4121, 7178)),
(1, (28.0, 31.0, 3476, 5783)),
(2, (31.0, 34.0, 3933, 6662)),
(3, (34.0, 36.0, 2896, 4652)),
(4, (36.0, 39.0, 5108, 7417)),
(5, (39.0, 41.0, 3963, 5843)),
(6, (41.0, 43.0, 3957, 5558)),
(7, (43.0, 45.0, 4317, 5691)),
(8, (45.0, 46.0, 2342, 3240)),
(9, (46.0, 48.0, 4864, 6150)),
(10, (48.0, 50.0, 4861, 6038)),
(11, (50.0, 52.0, 4621, 5523)),
(12, (52.0, 54.0, 4560, 4623)),
(13, (54.0, 56.0, 4498, 4010)),
(14, (56.0, 58.0, 4468, 3359)),
(15, (58.0, 61.0, 6451, 4746)),
(16, (61.0, 63.0, 4712, 2228)),
(17, (63.0, 67.0, 6974, 2578)),
(18, (67.0, 73.0, 6975, 2041)),
(19, (73.0, 107.0, 7998, 1492))]
Chi-Square Test, Bin Merging, and the IV Curve
# Chi-square test: p-value for each pair of adjacent bins
import scipy.stats as ss
pvs_list = []
for i in range(len(num_bins_) - 1):
    x1 = num_bins_[i][2:]
    x2 = num_bins_[i + 1][2:]
    # chi2_contingency returns (chi2, p, dof, expected); [1] is the p-value
    pv = ss.chi2_contingency([x1, x2])[1]
    pvs_list.append(pv)
pvs_list
[0.11728724900429034,
0.5508163222314948,
0.09053121716062464,
0.0007530919909110494,
0.5874212014912026,
0.10044784335748964,
0.029701216027069655,
0.15850408664347226,
0.007103486255692831,
0.5224541867703053,
0.16902174569691014,
1.2731738769229555e-08,
2.1014725165904846e-05,
6.866926043454617e-08,
0.476802189877255,
1.944497287524474e-43,
1.0901328872969334e-12,
8.002706776208699e-12,
6.800451443744261e-33]
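Each p-value above is Pearson's chi-square test of independence applied to the 2-by-2 contingency table formed by the good/bad counts of two adjacent bins:

$$\chi^2=\sum_{i,j}\frac{(O_{ij}-E_{ij})^2}{E_{ij}}$$

A large p-value means the two bins have statistically indistinguishable bad rates, so merging them discards little information; the code below therefore always merges the adjacent pair with the largest p-value (the ChiMerge idea).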
# Merge the two adjacent bins that are most similar, i.e. the pair whose
# chi-square test gives the largest p-value
i = pvs_list.index(max(pvs_list))
num_bins_[i : i + 2] = [
    (
        num_bins_[i][0],
        num_bins_[i + 1][1],
        num_bins_[i][2] + num_bins_[i + 1][2],
        num_bins_[i][3] + num_bins_[i + 1][3],
    )
]
import matplotlib.pyplot as plt
IV = []
axisx = []
while len(num_bins_) > 2:
    pvs = []
    # p-value of the chi-square test for each pair of adjacent bins
    for i in range(len(num_bins_) - 1):
        x1 = num_bins_[i][2:]
        x2 = num_bins_[i + 1][2:]
        # index 0 returns the chi2 statistic, index 1 the p-value
        pv = ss.chi2_contingency([x1, x2])[1]
        # chi2 = ss.chi2_contingency([x1, x2])[0]
        pvs.append(pv)
    # Merge the pair with the largest p-value
    i = pvs.index(max(pvs))
    num_bins_[i : i + 2] = [
        (
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3],
        )
    ]
    bins_df = get_woe(num_bins_)
    axisx.append(len(num_bins_))
    IV.append(get_iv(bins_df))
plt.figure()
plt.plot(axisx, IV)
plt.xticks(axisx)
plt.xlabel("number of bins")
plt.ylabel("IV")
plt.show()
# Wrap the chi-square merging into a binning function
def get_bin(num_bins_: list, n: int) -> list:
    """Merge adjacent bins by chi-square test until n bins remain.
    Args:
        num_bins_ (list): list of (min, max, count_0, count_1) tuples
        n (int): target number of bins
    Returns:
        list: the merged bins
    """
    while len(num_bins_) > n:
        pvs = []
        # p-value of the chi-square test for each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            # index 0 returns the chi2 statistic, index 1 the p-value
            pv = ss.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        # Merge the pair with the largest p-value
        i = pvs.index(max(pvs))
        num_bins_[i : i + 2] = [
            (
                num_bins_[i][0],
                num_bins_[i + 1][1],
                num_bins_[i][2] + num_bins_[i + 1][2],
                num_bins_[i][3] + num_bins_[i + 1][3],
            )
        ]
    return num_bins_
get_woe(get_bin(num_bins, 5))
min | max | count_0 | count_1 | tatol | percentage | bad_rate | good% | bad% | woe | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 21.0 | 36.0 | 14426 | 24275 | 38701 | 0.203789 | 0.627245 | 0.151701 | 0.256033 | -0.523395 |
1 | 36.0 | 52.0 | 34033 | 45460 | 79493 | 0.418589 | 0.571874 | 0.357884 | 0.479475 | -0.292483 |
2 | 52.0 | 61.0 | 19977 | 16738 | 36715 | 0.193331 | 0.455890 | 0.210074 | 0.176539 | 0.173920 |
3 | 61.0 | 73.0 | 18661 | 6847 | 25508 | 0.134318 | 0.268426 | 0.196235 | 0.072217 | 0.999645 |
4 | 73.0 | 107.0 | 7998 | 1492 | 9490 | 0.049972 | 0.157218 | 0.084105 | 0.015736 | 1.676094 |
import scipy
def graphforbestbin(DF, X, Y, n=5, q=20, graph=True):
    """
    Automatic optimal binning based on the chi-square test.
    Parameters:
        DF: input data
        X: name of the column to bin
        Y: name of the label column
        n: number of bins to keep
        q: number of initial (equal-frequency) bins
        graph: whether to plot the IV curve
    Intervals are left-open, right-closed: (].
    """
    global bins_df
    DF = DF[[X, Y]].copy()
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut").count()[Y]
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut").count()[Y]
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]
    # Make sure no bin is empty of either class (its WOE would be +/-inf):
    # repeatedly merge any bin with a zero count into a neighbour
    for i in range(q):
        # If the first bin lacks one class, merge it with the second
        if 0 in num_bins[0][2:]:
            num_bins[0:2] = [
                (
                    num_bins[0][0],
                    num_bins[1][1],
                    num_bins[0][2] + num_bins[1][2],
                    num_bins[0][3] + num_bins[1][3],
                )
            ]
            continue
        # Otherwise merge any later zero-count bin backwards into its predecessor
        for i in range(len(num_bins)):
            if 0 in num_bins[i][2:]:
                num_bins[i - 1 : i + 1] = [
                    (
                        num_bins[i - 1][0],
                        num_bins[i][1],
                        num_bins[i - 1][2] + num_bins[i][2],
                        num_bins[i - 1][3] + num_bins[i][3],
                    )
                ]
                break
        else:
            # No zero-count bins remain
            break
    def get_woe(num_bins):
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)
        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["good%"] = df.count_0 / df.count_0.sum()
        df["bad%"] = df.count_1 / df.count_1.sum()
        df["woe"] = np.log(df["good%"] / df["bad%"])
        return df
    def get_iv(df):
        rate = df["good%"] - df["bad%"]
        iv = np.sum(rate * df.woe)
        return iv
    IV = []
    axisx = []
    while len(num_bins) > n:
        pvs = []
        for i in range(len(num_bins) - 1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i + 1][2:]
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        i = pvs.index(max(pvs))
        num_bins[i : i + 2] = [
            (
                num_bins[i][0],
                num_bins[i + 1][1],
                num_bins[i][2] + num_bins[i + 1][2],
                num_bins[i][3] + num_bins[i + 1][3],
            )
        ]
        bins_df = pd.DataFrame(get_woe(num_bins))
        axisx.append(len(num_bins))
        IV.append(get_iv(bins_df))
    if graph:
        plt.figure()
        plt.plot(axisx, IV)
        plt.xticks(axisx)
        plt.xlabel("number of bins")
        plt.ylabel("IV")
        plt.show()
    return bins_df
dfs = []
for i in model_data.columns[1:-1]:
    # columns[1:-1] skips the label (column 0) and the helper "qcut" column (last)
    print(i)
    a = graphforbestbin(DF=model_data, X=i, Y="SeriousDlqin2yrs", graph=False)
    dfs.append(a)
RevolvingUtilizationOfUnsecuredLines
age
NumberOfTime30-59DaysPastDueNotWorse
DebtRatio
MonthlyIncome
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberRealEstateLoansOrLines
NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents
for i in dfs:
    print(i)
min max count_0 count_1 total percentage bad_rate \
0 0.000000 0.102453 42746 4731 47477 0.250001 0.099648
1 0.102453 0.224921 13207 5784 18991 0.100002 0.304565
2 0.224921 0.553524 17987 19994 37981 0.199998 0.526421
3 0.553524 1.000000 19729 53108 72837 0.383540 0.729135
4 1.000000 29110.000000 1426 11195 12621 0.066459 0.887014
good% bad% woe
0 0.449508 0.049899 2.198159
1 0.138882 0.061005 0.822671
2 0.189148 0.210880 -0.108763
3 0.207466 0.560140 -0.993218
4 0.014996 0.118076 -2.063574
min max count_0 count_1 total percentage bad_rate good% \
0 21.0 36.0 14426 24275 38701 0.203789 0.627245 0.151701
1 36.0 52.0 34033 45460 79493 0.418589 0.571874 0.357884
2 52.0 61.0 19977 16738 36715 0.193331 0.455890 0.210074
3 61.0 73.0 18661 6847 25508 0.134318 0.268426 0.196235
4 73.0 107.0 7998 1492 9490 0.049972 0.157218 0.084105
bad% woe
0 0.256033 -0.523395
1 0.479475 -0.292483
2 0.176539 0.173920
3 0.072217 0.999645
4 0.015736 1.676094
min max count_0 count_1 total percentage bad_rate good% \
0 21.0 36.0 14426 24275 38701 0.203789 0.627245 0.151701
1 36.0 52.0 34033 45460 79493 0.418589 0.571874 0.357884
2 52.0 61.0 19977 16738 36715 0.193331 0.455890 0.210074
3 61.0 73.0 18661 6847 25508 0.134318 0.268426 0.196235
4 73.0 107.0 7998 1492 9490 0.049972 0.157218 0.084105
bad% woe
0 0.256033 -0.523395
1 0.479475 -0.292483
2 0.176539 0.173920
3 0.072217 0.999645
4 0.015736 1.676094
min max count_0 count_1 total percentage bad_rate \
0 0.000000 0.017546 7816 1680 9496 0.050003 0.176917
1 0.017546 0.490044 52595 51853 104448 0.549996 0.496448
2 0.490044 1.120358 14828 23153 37981 0.199998 0.609594
3 1.120358 837.000000 8988 10006 18994 0.100017 0.526798
4 837.000000 329664.000000 10868 8120 18988 0.099986 0.427639
good% bad% woe
0 0.082191 0.017719 1.534399
1 0.553079 0.546903 0.011228
2 0.155928 0.244199 -0.448587
3 0.094516 0.105535 -0.110275
4 0.114286 0.085643 0.288512
min max count_0 count_1 total percentage bad_rate \
0 0.000000 9.000000e-02 6585 3001 9586 0.050477 0.313061
1 0.090000 1.419000e+03 13754 14643 28397 0.149531 0.515653
2 1419.000000 4.667000e+03 27842 38638 66480 0.350066 0.581197
3 4667.000000 6.238526e+03 13996 14476 28472 0.149926 0.508429
4 6238.526439 3.008750e+06 32918 24054 56972 0.299999 0.422207
good% bad% woe
0 0.069247 0.031652 0.782868
1 0.144634 0.154442 -0.065613
2 0.292781 0.407522 -0.330671
3 0.147179 0.152681 -0.036701
4 0.346159 0.253702 0.310738
min max count_0 count_1 total percentage bad_rate good% \
0 0.0 1.0 3104 7758 10862 0.057196 0.714233 0.032641
1 1.0 3.0 9412 13478 22890 0.120533 0.588816 0.098975
2 3.0 5.0 15485 16514 31999 0.168498 0.516079 0.162837
3 5.0 17.0 61589 53679 115268 0.606971 0.465689 0.647658
4 17.0 58.0 5505 3383 8888 0.046802 0.380626 0.057889
bad% woe
0 0.081825 -0.919013
1 0.142155 -0.362054
2 0.174176 -0.067317
3 0.566163 0.134481
4 0.035681 0.483913
min max count_0 count_1 total percentage bad_rate good% \
0 0.0 1.0 3104 7758 10862 0.057196 0.714233 0.032641
1 1.0 3.0 9412 13478 22890 0.120533 0.588816 0.098975
2 3.0 5.0 15485 16514 31999 0.168498 0.516079 0.162837
3 5.0 17.0 61589 53679 115268 0.606971 0.465689 0.647658
4 17.0 58.0 5505 3383 8888 0.046802 0.380626 0.057889
bad% woe
0 0.081825 -0.919013
1 0.142155 -0.362054
2 0.174176 -0.067317
3 0.566163 0.134481
4 0.035681 0.483913
min max count_0 count_1 total percentage bad_rate good% \
0 0.0 1.0 3104 7758 10862 0.057196 0.714233 0.032641
1 1.0 3.0 9412 13478 22890 0.120533 0.588816 0.098975
2 3.0 5.0 15485 16514 31999 0.168498 0.516079 0.162837
3 5.0 17.0 61589 53679 115268 0.606971 0.465689 0.647658
4 17.0 58.0 5505 3383 8888 0.046802 0.380626 0.057889
bad% woe
0 0.081825 -0.919013
1 0.142155 -0.362054
2 0.174176 -0.067317
3 0.566163 0.134481
4 0.035681 0.483913
min max count_0 count_1 total percentage bad_rate good% \
0 0.0 1.0 3104 7758 10862 0.057196 0.714233 0.032641
1 1.0 3.0 9412 13478 22890 0.120533 0.588816 0.098975
2 3.0 5.0 15485 16514 31999 0.168498 0.516079 0.162837
3 5.0 17.0 61589 53679 115268 0.606971 0.465689 0.647658
4 17.0 58.0 5505 3383 8888 0.046802 0.380626 0.057889
bad% woe
0 0.081825 -0.919013
1 0.142155 -0.362054
2 0.174176 -0.067317
3 0.566163 0.134481
4 0.035681 0.483913
min max count_0 count_1 total percentage bad_rate good% \
0 0.0 1.0 3104 7758 10862 0.057196 0.714233 0.032641
1 1.0 3.0 9412 13478 22890 0.120533 0.588816 0.098975
2 3.0 5.0 15485 16514 31999 0.168498 0.516079 0.162837
3 5.0 17.0 61589 53679 115268 0.606971 0.465689 0.647658
4 17.0 58.0 5505 3383 8888 0.046802 0.380626 0.057889
bad% woe
0 0.081825 -0.919013
1 0.142155 -0.362054
2 0.174176 -0.067317
3 0.566163 0.134481
4 0.035681 0.483913
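Note that several of the printed tables are verbatim repeats of an earlier feature's table. For the discrete delinquency counts, pd.qcut with duplicates="drop" produces too few initial bins, the merging loop inside graphforbestbin never runs, and the function falls back to the stale global bins_df left over from the previous feature. Those features cannot be auto-binned and get hand-picked edges below, while the five features that produced distinct tables are auto-binned with the bin counts listed in auto_col_bins.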
auto_col_bins = {
"RevolvingUtilizationOfUnsecuredLines": 6,
"age": 5,
"DebtRatio": 4,
"MonthlyIncome": 3,
"NumberOfOpenCreditLinesAndLoans": 5,
}
# Variables that cannot be binned automatically: hand-picked bin edges
hand_bins = {
    "NumberOfTime30-59DaysPastDueNotWorse": [0, 1, 2, 13],
    "NumberOfTimes90DaysLate": [0, 1, 2, 17],
    "NumberRealEstateLoansOrLines": [0, 1, 2, 4, 54],
    "NumberOfTime60-89DaysPastDueNotWorse": [0, 1, 2, 8],
    "NumberOfDependents": [0, 1, 2, 3],
}
# To guarantee full coverage, replace the largest edge with np.inf and add -np.inf as the smallest
hand_bins = {k: [-np.inf, *v[:-1], np.inf] for k, v in hand_bins.items()}
bins_of_col = {}
# Generate the bin edges for the auto-binned variables
for col in auto_col_bins:
    bins_df = graphforbestbin(
        model_data,
        col,
        "SeriousDlqin2yrs",
        # look up the number of bins for this feature in the dict
        n=auto_col_bins[col],
        q=20,
        graph=False,
    )
    bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
    # To guarantee full coverage, replace the smallest edge with -np.inf and the largest with np.inf
    bins_list[0], bins_list[-1] = -np.inf, np.inf
    bins_of_col[col] = bins_list
# Merge in the hand-binned variables
bins_of_col.update(hand_bins)
bins_of_col
{'RevolvingUtilizationOfUnsecuredLines': [-inf,
0.1024533425,
0.2249209732,
0.5535236390041033,
0.9829105421203392,
0.9999999,
inf],
'age': [-inf, 36.0, 52.0, 61.0, 73.0, inf],
'DebtRatio': [-inf, 0.017546052905140292, 0.4900438860450557, 837.0, inf],
'MonthlyIncome': [-inf, 0.09, 6238.526438865782, inf],
'NumberOfOpenCreditLinesAndLoans': [-inf, 1.0, 3.0, 5.0, 17.0, inf],
'NumberOfTime30-59DaysPastDueNotWorse': [-inf, 0, 1, 2, inf],
'NumberOfTimes90DaysLate': [-inf, 0, 1, 2, inf],
'NumberRealEstateLoansOrLines': [-inf, 0, 1, 2, 4, inf],
'NumberOfTime60-89DaysPastDueNotWorse': [-inf, 0, 1, 2, inf],
'NumberOfDependents': [-inf, 0, 1, 2, inf]}
data = model_data.copy()
data["cut"] = pd.cut(data["age"], bins_of_col["age"])
# Cross-tabulate label counts per age bin
bins_df = data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df
SeriousDlqin2yrs | 0 | 1 |
---|---|---|
cut | ||
(-inf, 36.0] | 14426 | 24275 |
(36.0, 52.0] | 34033 | 45460 |
(52.0, 61.0] | 19977 | 16738 |
(61.0, 73.0] | 18661 | 6847 |
(73.0, inf] | 7998 | 1492 |
bins_df[0]
cut
(-inf, 36.0] 14426
(36.0, 52.0] 34033
(52.0, 61.0] 19977
(61.0, 73.0] 18661
(73.0, inf] 7998
Name: 0, dtype: int64
bins_df["woe"] = np.log(
(bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum())
)
bins_df
SeriousDlqin2yrs | 0 | 1 | woe |
---|---|---|---|
cut | |||
(-inf, 36.0] | 14426 | 24275 | -0.523395 |
(36.0, 52.0] | 34033 | 45460 | -0.292483 |
(52.0, 61.0] | 19977 | 16738 | 0.173920 |
(61.0, 73.0] | 18661 | 6847 | 0.999645 |
(73.0, inf] | 7998 | 1492 | 1.676094 |
def get_woe(df, col, y, bins):
    df = df[[col, y]].copy()
    df["cut"] = pd.cut(df[col], bins)
    bins_df = df.groupby("cut")[y].value_counts().unstack()
    woe = bins_df["woe"] = np.log(
        (bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum())
    )
    return woe
# Store the WOE of every feature in a dict
woeall = {}
for col in bins_of_col:
    woeall[col] = get_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
woeall
{'RevolvingUtilizationOfUnsecuredLines': cut
(-inf, 0.102] 2.198159
(0.102, 0.225] 0.822671
(0.225, 0.554] -0.108763
(0.554, 0.983] -1.156168
(0.983, 1.0] -0.483896
(1.0, inf] -2.063574
dtype: float64,
'age': cut
(-inf, 36.0] -0.523395
(36.0, 52.0] -0.292483
(52.0, 61.0] 0.173920
(61.0, 73.0] 0.999645
(73.0, inf] 1.676094
dtype: float64,
'DebtRatio': cut
(-inf, 0.0175] 1.534399
(0.0175, 0.49] 0.011228
(0.49, 837.0] -0.333937
(837.0, inf] 0.288512
dtype: float64,
'MonthlyIncome': cut
(-inf, 0.09] 0.782868
(0.09, 6238.526] -0.200869
(6238.526, inf] 0.310738
dtype: float64,
'NumberOfOpenCreditLinesAndLoans': cut
(-inf, 1.0] -0.919013
(1.0, 3.0] -0.362054
(3.0, 5.0] -0.067317
(5.0, 17.0] 0.134481
(17.0, inf] 0.483913
dtype: float64,
'NumberOfTime30-59DaysPastDueNotWorse': cut
(-inf, 0.0] 0.351043
(0.0, 1.0] -0.868353
(1.0, 2.0] -1.363561
(2.0, inf] -1.482353
dtype: float64,
'NumberOfTimes90DaysLate': cut
(-inf, 0.0] 0.235054
(0.0, 1.0] -1.737703
(1.0, 2.0] -2.281241
(2.0, inf] -2.377136
dtype: float64,
'NumberRealEstateLoansOrLines': cut
(-inf, 0.0] -0.408144
(0.0, 1.0] 0.195483
(1.0, 2.0] 0.650412
(2.0, 4.0] 0.372983
(4.0, inf] -0.297756
dtype: float64,
'NumberOfTime60-89DaysPastDueNotWorse': cut
(-inf, 0.0] 0.124264
(0.0, 1.0] -1.375012
(1.0, 2.0] -1.807985
(2.0, inf] -1.792286
dtype: float64,
'NumberOfDependents': cut
(-inf, 0.0] 0.636441
(0.0, 1.0] -0.586082
(1.0, 2.0] -0.510146
(2.0, inf] -0.451625
dtype: float64}
Modeling and Validation
# To avoid overwriting the original data, create a new DataFrame with exactly the same index as model_data
model_woe = pd.DataFrame(index=model_data.index)
# Bin the original data, then map each bin to its WOE value
model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])
model_woe
age | |
---|---|
0 | 0.173920 |
1 | 0.999645 |
2 | -0.292483 |
3 | 0.173920 |
4 | -0.292483 |
... | ... |
189902 | 0.173920 |
189903 | -0.292483 |
189904 | -0.523395 |
189905 | 0.999645 |
189906 | -0.523395 |
189907 rows × 1 columns
# The same operation for all features:
for col in bins_of_col:
    model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])
# Append the label
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
# Do the same for the validation set (note: model_vali, not model_data)
vali_woe = pd.DataFrame(index=model_vali.index)
for col in bins_of_col:
    vali_woe[col] = pd.cut(model_vali[col], bins_of_col[col]).map(woeall[col])
vali_woe["SeriousDlqin2yrs"] = model_vali["SeriousDlqin2yrs"]
vali_X = vali_woe.iloc[:, :-1]
vali_y = vali_woe.iloc[:, -1]
x = model_woe.iloc[:, :-1]
y = model_woe.iloc[:, -1]
x.columns
Index(['age', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfDependents'],
dtype='object')
age = vali_X.pop("age")
age
0 0.173920
1 0.999645
2 -0.292483
3 0.173920
4 -0.292483
...
189902 0.173920
189903 -0.292483
189904 -0.523395
189905 0.999645
189906 -0.523395
Name: age, Length: 189907, dtype: category
Categories (5, float64): [-0.523395 < -0.292483 < 0.173920 < 0.999645 < 1.676094]
vali_X.insert(0, column="age", value=age)
vali_X.columns
Index(['age', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfDependents'],
dtype='object')
from sklearn.linear_model import LogisticRegression as LR
lr = LR().fit(x, y)
lr.score(vali_X, vali_y)
0.7838626274966167
from sklearn.model_selection import cross_val_score
c_1 = np.linspace(0.01, 1, 20)
c_2 = np.linspace(0.01, 0.2, 20)
score_list_1 = []
score_list_2 = []
for i in range(20):
    lr1 = LR(penalty="l2", solver="liblinear", C=c_1[i])
    lr1.fit(x, y)
    score_list_1.append(cross_val_score(lr1, vali_X, vali_y, cv=10).mean())
    lr2 = LR(penalty="l2", solver="liblinear", C=c_2[i])
    lr2.fit(x, y)
    score_list_2.append(cross_val_score(lr2, vali_X, vali_y, cv=10).mean())
plt.figure(figsize=[20, 5])
plt.plot(range(20), score_list_1, label="c_1")
plt.plot(range(20), score_list_2, label="c_2")
plt.legend()
plt.show()
import scikitplot as skplt
vali_proba_df = pd.DataFrame(lr.predict_proba(vali_X))
skplt.metrics.plot_roc(
vali_y, vali_proba_df, plot_micro=False, figsize=(6, 6), plot_macro=False
)
<AxesSubplot: title={'center': 'ROC Curves'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>
Building the Scorecard
B = 20 / np.log(2)
A = 600 + B * np.log(1 / 60)
B, A
(28.85390081777927, 481.8621880878296)
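These constants follow the usual score-scaling convention $\text{Score} = A - B\ln(\text{odds})$, pinned down by two choices that match the code above: a base score of 600 at odds of 1:60, and 20 points to double the odds (PDO):

$$B=\frac{\text{PDO}}{\ln 2}=\frac{20}{\ln 2},\qquad A=600+B\ln\frac{1}{60}$$

so each doubling of the odds of default lowers the score by 20 points.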
base_score = A - B * lr.intercept_
base_score
score_age = woeall["age"] * (-B * lr.coef_[0][0])
score_age
cut
(-inf, 36.0] -4.180160
(36.0, 52.0] -2.335948
(52.0, 61.0] 1.389030
(61.0, 73.0] 7.983784
(73.0, inf] 13.386325
dtype: float64
file = "./ScoreData.csv"
# 首先写入基准分数
# 之后使用循环,每次生成一组score_age类似的分档和分数,不断写入文件之中
with open(file, "w") as fdata:
fdata.write("base_score,{}\n".format(base_score))
for i, col in enumerate(x.columns):
score = woeall[col] * (-B * lr.coef_[0][i])
score.name = "Score"
score.index.name = col
score.to_csv(file, header=True, mode="a")