# 2. Logistic regression
# 2.1 Standard approach, but the class-imbalance problem must be taken into account
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# --- Feature/target split -------------------------------------------------
# NOTE(review): assumes `z` (built earlier in the file) holds 7 feature
# columns followed by the binary target in column 7 -- confirm upstream.
x = z.iloc[:, 0:7]
# Select the target as a 1-D Series (not a one-column DataFrame) so sklearn
# does not raise the column-vector-target warning.
y = z.iloc[:, 7]

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings('ignore')

# stratify=y keeps the train/test class ratios equal -- first half of the
# imbalance handling called out in the section header.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=1, stratify=y)

# class_weight='balanced' reweights samples inversely to class frequency,
# the standard logistic-regression remedy for an imbalanced target.
model = LogisticRegression(C=0.7, class_weight='balanced')
model.fit(x_train, y_train)

# Probability of the positive class (column 1) as a flat 1-D array --
# roc_curve expects 1-D scores, not a one-column DataFrame.
p = model.predict_proba(x_test)
score = p[:, 1]
label = y_test

fpr, tpr, thresholds = roc_curve(label, score)
# KS statistic: the maximum vertical gap between the TPR and FPR curves.
ks_value = max(abs(fpr - tpr))

# Plot the FPR, TPR, and gap curves

# --- Plot the fpr / tpr / gap curves and mark the KS point ---------------
plt.plot(fpr, label='bad')
plt.plot(tpr, label='good')
plt.plot(abs(fpr - tpr), label='diff')

# Mark the KS point.
# Index along the threshold axis where the gap equals the KS value.
# Named ks_idx (was `x`) so it no longer clobbers the feature matrix
# `x` created earlier in the script.
ks_idx = np.argwhere(abs(fpr - tpr) == ks_value)[0, 0]
plt.plot((ks_idx, ks_idx), (0, ks_value),
         label='ks - {:.2f}'.format(ks_value),
         color='r', marker='o', markerfacecolor='r', markersize=5)
plt.scatter((ks_idx, ks_idx), (0, ks_value), color='r')
plt.legend()
plt.show()

# Grid search for the best hyper-parameter C
# --- Grid-search the inverse-regularisation strength C -------------------
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()
grid = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
              1, 10, 15, 20, 50, 100]}
# 10-fold cross-validation over the C grid.
gs = GridSearchCV(model, grid, cv=10)
gs.fit(x_train, y_train)

# Bare attribute access is a no-op outside a notebook -- print the results
# so the script actually reports them.
print('best CV score :', gs.best_score_)
print('best params   :', gs.best_params_)