1.测试集验证集合
library(ggplot2)
library(caret)
set.seed(123456) 设置随机数种子
prop.table(table(xxx字段))
index<-createDataPartition(xxx字段, p=xxx, list=FALSE)
xunlianji<- credit[index,]
ceshiji<- credit[-index,]
prop.table(table(xunlianji))
prop.table(table(ceshiji))
dim(xunlianji)
dim(ceshiji)
2.建模分析
fit<- glm(y~.,data= xunlianji,family="xxx")
summary(fit)
看分析结果 修正字段
fit1<- glm(y~age+xxx字段,data= xunlianji,family="xxx")
summary(fit1)
模型修正
xunlianji$y<-as.factor(xunlianji$y)
xunlianji<-SMOTE(y~.,xunlianji,perc.over = 500, perc.under=100)
prop.table(table(xxx训练字段))
模型评估
install.packages("pROC", repo=" tsinghua.eduxxx")
library(pROC)
pre <- predict(fit1,ceshiji)
modelroc <- roc(xxx测试字段,pre)
modelroc 输出
plot(modelroc,print.auc=TRUE, auc.polygon=TRUE,grid=c(0.1,0.2), grid.col=c('GREEN','RED'),max.auc.polygon=TRUE,auc.polygon.col="skyblue",print.thres=TRUE)
建立评分卡
library(sqldf)
library(gsubfn)
library(smbinning)
对数据分箱
par(mar=c(5,4,2,3))
如取AGE 字段分箱
age <- smbinning(table,"y","xxx字段")
age$iv 查看字段IV
par(mfrow=c(2,2))
smbinning.plot(age,option="dist", sub="名称")
smbinning.plot(age,option="WoE", sub="名称")
smbinning.plot(age,option="goodrate", sub="名称")
smbinning.plot(age,option="badrate", sub="名称")
par(mfrow=c(1,1))
age$iv 查看字段IV
xxx<-table
xxx<-smbinning.gen(xxx,字段,"名称")
xxx<-smbinning.gen(xxx,字段,"名称")
xxx<-smbinning.gen(xxx,字段,"名称")
xxx<-smbinning.gen(xxx,字段,"名称")
head(xxx)
查看 要生成的新列 xxx_new <-xxx[,c(1,11:18)]
head(xxx_new)
xxx_mod<-glm(y~., data=xxx_new,family=binomial())
summary(xxx_mod)
打分
cre_scal <-smbinning.scaling(xxx_mod,pdo=45,score=800,odds=50)
cre_scal$minmaxscore
cre_scal$logitscaled
对每行生成对应的分值
xxx4<-smbinning.scoring.gen(smbscaled=cre_scal,dataset=xxx_new)
View(xxx4)
boxplot(score~y,data=xxx4,horizontal=TRUE,frame=FALSE,col="lightgray",main="distribution")
分类器的性能和比较
smbinning.metrics(xxx4,"score","y",plot="auc")
一般银行都会根据评分分值设定贷款阈值,并结合数据量判断该阈值是否合适、给企业带来的风险大不大。