代码改变世界

特征不同取值/区间下 label 的均值曲线

2018-03-15 14:03  xplorerthik  阅读(341)  评论(0)  编辑  收藏  举报

def two_plot(df, feat, tick_label=None, rotate_tick=60):
    """Plot sample share (bars) and mean label (line) per value/bin of ``feat``.

    The left y-axis shows, for every distinct value of ``df[feat]``, the
    fraction of all rows of ``df`` that take that value. The right y-axis
    (a twin axis) shows the mean of ``df['label']`` within the same group.

    Args:
        df: DataFrame containing the column ``feat`` and a ``label`` column.
        feat: Name of the column to group by (typically a binned feature).
        tick_label: Optional x tick labels, one per distinct value of
            ``feat`` in sorted order. Defaults to the values themselves.
        rotate_tick: Rotation angle in degrees applied to the x tick labels.
            A falsy value (``0`` / ``None``) leaves the labels unrotated.
    """
    print('\n### 不同取值/区间下 label 的均值曲线')
    _, ax1 = plt.subplots()

    proportions = df[feat].value_counts().sort_index() / df.shape[0]
    x = proportions.index
    positions = range(len(x))
    y1 = proportions.values
    # Reindex on the bar index so every point of the line sits over its own
    # bar, even when groupby yields a different set of groups than
    # value_counts (e.g. empty categories being dropped by one of them).
    y2 = df.groupby(feat)['label'].mean().reindex(x).values

    # `is None` rather than truthiness: array-like labels have no single
    # truth value, and an explicitly passed label list must be honoured.
    labels = x if tick_label is None else tick_label
    ax1.bar(positions, y1, color='b', tick_label=labels)
    ax1.set_xlabel(feat)
    ax1.set_ylabel('sample number ratio')
    if rotate_tick:
        for tick in ax1.get_xticklabels():
            tick.set_rotation(rotate_tick)

    # Second y-axis sharing the same x positions as the bars.
    ax2 = ax1.twinx()
    ax2.plot(positions, y2, color='r')
    ax2.set_ylabel('DPD30+ ratio')
    plt.show()

 

 

# Binary target: 0 when max_overperiod_his < 30, otherwise 1. Written as the
# negation of `< 30` so that missing values still map to 1, exactly like the
# original row-wise `0 if x < 30 else 1`.
overdue_df['label'] = (~(overdue_df.max_overperiod_his < 30)).astype(int)
feat = 'rev1m_messagetab_pv'  # alternatives tried: 'rev6m_data05', 'rev3m_messagetab_pv'
bins = [-0.1, 50, 200, 500, 1000, 5000, 10000, 15000]  # hand-picked bin edges
# The binned column must live on the same frame that carries `label` and is
# passed to two_plot; writing it to a different frame leaves the plot without
# its grouping column.
overdue_df[feat + '_bins'] = pd.cut(overdue_df[feat], bins=bins)

two_plot(overdue_df, feat + '_bins')