交易数据异常检测
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report


def printing_Kfold_scores(x_train_data, y_train_data):
    """Pick the regularisation strength C that maximises mean recall.

    For every candidate C, an L1-penalised logistic regression is trained
    and scored with 5-fold cross-validation (folds are taken in order,
    without shuffling). The recall of each fold and the mean recall per C
    are printed as the search progresses.

    Args:
        x_train_data: Feature table; must support ``.iloc`` row selection
            (i.e. a pandas DataFrame).
        y_train_data: Label table with the same row count; it is indexed
            with ``.iloc[rows, :]`` and flattened, so it is expected to be
            a single-column DataFrame.

    Returns:
        The value of C whose mean recall across the folds is highest.
    """
    fold = KFold(5, shuffle=False)

    c_param_range = [0.01, 0.1, 1, 10, 100]

    mean_recalls = []
    for c_param in c_param_range:
        print('------------------------------')
        print('C parameter:', c_param)
        print('------------------------------')
        print('')

        recall_accs = []
        # Cross-validation: each split yields (train_rows, validation_rows).
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(x_train_data)):
            # The L1 penalty is not supported by the default 'lbfgs' solver,
            # so a solver that implements it is selected explicitly.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            lr.fit(
                x_train_data.iloc[train_idx, :],
                y_train_data.iloc[train_idx, :].values.ravel(),
            )

            # Predict on the DataFrame slice (not a bare ndarray) so the
            # input carries the same feature names the model was fitted with.
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :])

            recall_acc = recall_score(
                y_train_data.iloc[valid_idx, :].values.ravel(),
                y_pred_undersample,
            )
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ' :recall score= ', recall_acc)

        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score', mean_recall)
        print('')

    # Build the summary once, with one row per candidate and BOTH columns
    # populated, so the best row can be mapped back to its C value.
    results_table = pd.DataFrame({
        'C_parameter': c_param_range,
        'Mean recall score': mean_recalls,
    })

    best_row = results_table['Mean recall score'].astype('float64').idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('*****************************************')
    print('Best model to choose from cross validation is with C Paramter =', best_c)
    print('*****************************************')

    return best_c


# NOTE(review): X_train_undersample / y_train_undersample are not defined in
# this part of the file; they are presumably prepared earlier - confirm.
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)