信用评分预测模型(九)--组合分类器
Author:LieDra
https://www.cnblogs.com/LieDra/
前言
下面将对数据利用组合分类器进行处理分析。
介绍
我们使用多专家组合的全局方法,构造并行的架构:对于给定的一个测试集输入,所有的基学习器都先各自完成训练,并给出测试样本的输出,我们将各个输出都保存起来,以进行下一步判断。
我们选用了最常见的方法,即投票法,对于每个输出进行“求和”,即如果有三个及以上的基学习器得到的结果是同一类,那么最终的结果就是这一类。
代码示例
MyAPI.py
'''
version:
author:LieDra
Method:接口--输入非标准化数据或pca后数据
'''
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model.logistic import LogisticRegression
from sklearn.neural_network import MLPClassifier
# To read a different file, pass its path to get_data (or change the default).
def get_data(path='D:/study/5/code/python/python Data analysis and mining/class/dataset/german.xls'):
    """Load the dataset from an Excel file and split it into features and target.

    Args:
        path: Location of the Excel file. Defaults to the original (raw) data
            file; the PCA-reduced variant used in the article lives next to it
            as 'german-pca.xls'.

    Returns:
        A tuple ``(x, y)`` where ``x`` holds every column except the last one
        and ``y`` is the last column.
    """
    df = pd.read_excel(path)
    # DataFrame.ix was removed in pandas 1.0; .iloc is the purely positional
    # selector, which is what "all but the last column" needs.
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    return x, y
def list_add(a, b):
    """Add ``b`` element-wise into ``a``, modifying ``a`` in place.

    Both sequences must have the same length. Nothing is returned; the
    sums are stored in ``a``.
    """
    assert len(a)==len(b)
    for position, addend in enumerate(b):
        a[position] += addend
def list_div(a, num):
    """Divide every element of ``a`` by ``num`` in place and return ``a``."""
    for position, _ in enumerate(a):
        a[position] /= num
    return a
def get_result_of_5classifiers(l1, l2, l3, l4, l5):
    """Combine five prediction sequences by majority vote.

    The five sequences are summed position by position; a position whose
    sum is greater than 2 becomes 1, every other position becomes 0.
    The inputs are left untouched: the sums are accumulated in a deep copy
    of ``l1``, and that copy is what gets returned.
    """
    votes = copy.deepcopy(l1)
    for predictions in (l2, l3, l4, l5):
        list_add(votes, predictions)
    for position in range(len(votes)):
        votes[position] = 1 if votes[position] > 2 else 0
    return votes
def get_acc_of_5classifiers(result, real):
    """Return the fraction of positions where ``result`` equals ``real``.

    Positions are walked over the length of ``result``; the hit count is
    divided by the length of ``real``.
    """
    hits = 0
    for position in range(len(result)):
        if result[position] == real[position]:
            hits += 1
    return hits / len(real)
def get_score(result, real):
    """Return the total misclassification cost of ``result`` against ``real``.

    Matching positions cost nothing. A mismatch where the prediction is 0
    costs 1, a mismatch where the prediction is 1 costs 5; any other
    mismatching value adds nothing.
    """
    total_cost = 0
    for position in range(len(result)):
        predicted = result[position]
        if predicted == real[position]:
            continue
        if predicted == 0:
            total_cost += 1
        elif predicted == 1:
            total_cost += 5
    return total_cost
def AUC_performance(AUC):
    """Print a qualitative rating for an AUC value.

    Bands: ``>= 0.7`` good, ``(0.6, 0.7)`` not very good, ``(0.5, 0.6]``
    useless, ``<= 0.5`` bad. Nothing is returned.
    """
    if AUC >= 0.7:
        print("good classifier")
    elif 0.6 < AUC:
        # Reaching here already means AUC is not >= 0.7.
        print("not very good classifier")
    elif 0.5 < AUC:
        print("useless classifier")
    elif AUC <= 0.5:
        print("bad classifier,with sorting problems")
def get_train_check(seed):
    """Draw a fresh check subset from the module-level hold-out data.

    The module globals ``x_test``/``y_test`` are split with
    ``train_size=0.25`` and ``random_state=seed``; the second part of that
    split is the check subset. The training data is the unchanged module-level
    ``x_train``/``y_train``.

    Returns:
        ``(x_train, x_check, y_train, y_check)``
    """
    parts = train_test_split(x_test, y_test, train_size=0.25, random_state=seed)
    # train_test_split returns [x_first, x_second, y_first, y_second];
    # only the second halves are used.
    x_check, y_check = parts[1], parts[3]
    return x_train, x_check, y_train, y_check
def get_decision_tree_classifier():
    """Train a depth-3 decision tree on the module-level training split.

    Reads the module globals ``x_train``, ``y_train``, ``x_check`` and
    ``y_check``. The features are used as-is (no standardisation).
    Prints the accuracy on the check subset.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    best_tree = DecisionTreeClassifier(max_depth=3, random_state=88)
    best_tree = best_tree.fit(x_train, y_train)
    # The former `accuracy_test = best_tree.score(x_test, y_test)` was never
    # read, so that extra scoring pass has been dropped.
    print("decision tree:")
    print("accuracy on the check subset:{:.3f}".format(best_tree.score(x_check, y_check)))
    return best_tree
def get_random_forest_classifier():
    """Train a 10-tree random forest on the module-level training split.

    Reads the module globals ``x_train`` and ``y_train``. The features are
    used as-is (no standardisation). Nothing is printed.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    n_trees = 10
    model = RandomForestClassifier(n_estimators=n_trees, random_state=38)
    # fit() returns the estimator itself.
    return model.fit(x_train, y_train)
def get_svm_classifier():
    """Train an SVC on the standardised module-level training split.

    Reads the module globals ``x_train``, ``y_train``, ``x_check`` and
    ``y_check``. Prints the accuracy on the scaled training data and on the
    scaled check data.

    Returns:
        The fitted ``SVC``. Callers must standardise inputs before predicting.
    """
    # NOTE(review): the check subset is scaled with its own statistics, not
    # the training ones; callers in this file do the same at predict time.
    scaled_train = preprocessing.scale(x_train)
    scaled_check = preprocessing.scale(x_check)
    classifier = SVC(random_state=66)
    classifier.fit(scaled_train, y_train)
    train_accuracy = classifier.score(scaled_train, y_train)
    print("accuracy on the scaled training subset:{:.3f}".format(train_accuracy))
    check_accuracy = classifier.score(scaled_check, y_check)
    print("accuracy on the scaled test subset:{:.3f}".format(check_accuracy))
    return classifier
def get_logistic_regression_classifier():
    """Train a logistic-regression model on the module-level training split.

    Reads the module globals ``x_train``, ``y_train``, ``x_check`` and
    ``y_check``. The features are used as-is (no standardisation).
    Prints the accuracy on the check subset.

    Returns:
        The fitted ``LogisticRegression``.
    """
    model = LogisticRegression(random_state=99)
    model.fit(x_train, y_train)
    check_accuracy = model.score(x_check, y_check)
    print("accuracy on the test subset:{:.3f}".format(check_accuracy))
    return model
def get_mlp_classifier():
    """Train an MLP on the standardised module-level training split.

    Reads the module globals ``x_train``, ``y_train``, ``x_check`` and
    ``y_check``. Prints the accuracy on the standardised check subset.

    Returns:
        The fitted ``MLPClassifier``. Callers must standardise inputs before
        predicting.
    """
    scaler = StandardScaler()
    scaler.fit(x_train)
    train_scaled = scaler.transform(x_train)
    # NOTE(review): the scaler is re-fitted on the check subset, so the check
    # data is standardised with its own statistics, not the training ones.
    scaler.fit(x_check)
    check_scaled = scaler.transform(x_check)
    network = MLPClassifier(max_iter=100, random_state=99)
    network.fit(train_scaled, y_train)
    check_accuracy = network.score(check_scaled, y_check)
    print("accuracy on the check subset:{:.3f}".format(check_accuracy))
    return network
# Module-level data preparation. This runs once at import time; the
# get_*_classifier functions and get_train_check read the variables created
# here as module globals.
x,y = get_data()
# First split: 80% of the rows become the training set, the rest is held out.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=38)
# Second split of the held-out rows: the first 25% goes to x_test2/y_test2,
# the remainder is the default check subset x_check/y_check.
x_test2,x_check,y_test2,y_check=train_test_split(x_test,y_test,train_size=0.25,random_state=38)
# Leftover debugging snippets, kept for reference:
# x,y = get_data()
# x,x1,y,y1 = get_train_check(1)
# print(y)
# print(type(y))
# print(list(y))
# print(type(y))
# print(type(list(y)))
main.py
'''
version:
author:LieDra
Method:组合分类器
'''
# import numpy as np
# import matplotlib.pyplot as plt
from MyAPI import *
def main():
    """Evaluate the five-classifier majority vote over 200 check subsets.

    The five base classifiers are trained once. Each round (seeds 1..200)
    draws a new check subset with ``get_train_check``, predicts with every
    classifier, combines the predictions by vote, and prints the accuracy
    of that round followed by the running maximum accuracy, average accuracy
    and average cost score for the ensemble and for each base classifier.
    """
    # Train the five base classifiers.
    tree_clf = get_decision_tree_classifier()
    forest_clf = get_random_forest_classifier()
    svm_clf = get_svm_classifier()
    logistic_clf = get_logistic_regression_classifier()
    mlp_clf = get_mlp_classifier()

    # One history per column, in print order:
    # ensemble, decision tree, random forest, SVM, logistic regression, MLP.
    acc_history = [[] for _ in range(6)]
    score_history = [[] for _ in range(6)]

    for round_no in range(1, 201):
        print('第',round_no,'次',end=' ')
        _, x_check, _, y_check = get_train_check(round_no)
        truth = list(y_check)

        tree_pred = tree_clf.predict(x_check)
        forest_pred = forest_clf.predict(x_check)
        # SVM and MLP were trained on standardised features, so the check
        # subset is standardised before they predict.
        svm_pred = svm_clf.predict(preprocessing.scale(x_check))
        logistic_pred = logistic_clf.predict(x_check)
        mlp_scaler = StandardScaler()
        mlp_pred = mlp_clf.predict(mlp_scaler.fit(x_check).transform(x_check))

        base_predictions = [tree_pred, forest_pred, svm_pred, logistic_pred, mlp_pred]
        voted = get_result_of_5classifiers(*base_predictions)
        outputs = [voted] + base_predictions

        round_acc = [get_acc_of_5classifiers(output, truth) for output in outputs]
        print('acc ',round_acc[0])
        print('acc:%.5f 决策树:%.5f 随机森林:%.5f SVM:%.5f 逻辑回归:%.5f MLP:%.5f'%tuple(round_acc))

        for history, value in zip(acc_history, round_acc):
            history.append(value)
        for history, output in zip(score_history, outputs):
            history.append(get_score(output, truth))

        # Running summary, printed every round.
        ensemble_acc = acc_history[0]
        print('max acc:',max(ensemble_acc))
        print('avg acc:',sum(ensemble_acc)/len(ensemble_acc))
        print('max acc:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'
              % tuple(max(history) for history in acc_history))
        print('avg acc:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'
              % tuple(sum(history)/len(history) for history in acc_history))
        print('avg score:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'
              % tuple(sum(history)/len(history) for history in score_history))
# Run the ensemble evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
结果
PCA前
···
第 100 次 acc 0.7533333333333333
acc:0.75333 决策树:0.68000 随机森林:0.70667 SVM:0.78000 逻辑回归:0.74000 MLP:0.79333
max acc: 0.7533333333333333
avg acc: 0.7117333333333333
max acc:0.7533333决策树:0.6933333 随机森林:0.7400000 SVM:0.7800000 逻辑回归:0.7466667 MLP:0.8333333
avg acc:0.7117333决策树:0.6534667 随机森林:0.7004000 SVM:0.7309333 逻辑回归:0.7030000 MLP:0.7694667
avg score:189.4000000决策树:238.8200000 随机森林:167.8600000 SVM:190.4000000 逻辑回归:192.8300000 MLP:144.5000000
···
第 200 次 acc 0.7
acc:0.70000 决策树:0.65333 随机森林:0.68667 SVM:0.73333 逻辑回归:0.70667 MLP:0.76000
max acc: 0.7533333333333333
avg acc: 0.7118666666666669
max acc:0.7533333决策树:0.7133333 随机森林:0.7400000 SVM:0.7800000 逻辑回归:0.7533333 MLP:0.8333333
avg acc:0.7118667决策树:0.6538667 随机森林:0.7000667 SVM:0.7316333 逻辑回归:0.7037000 MLP:0.7693333
avg score:189.2600000决策树:238.5800000 随机森林:168.5100000 SVM:189.6950000 逻辑回归:192.3450000 MLP:144.6400000
PCA后
第 100 次 acc 0.7533333333333333
acc:0.75333 决策树:0.72000 随机森林:0.68667 SVM:0.73333 逻辑回归:0.74667 MLP:0.74667
max acc: 0.76
avg acc: 0.7196666666666666
max acc:0.7600000决策树:0.7666667 随机森林:0.7200000 SVM:0.7466667 逻辑回归:0.7533333 MLP:0.7733333
avg acc:0.7196667决策树:0.6885333 随机森林:0.6675333 SVM:0.6994667 逻辑回归:0.7138000 MLP:0.7332667
avg score:177.0100000决策树:193.8400000 随机森林:189.4700000 SVM:194.1600000 逻辑回归:169.8900000 MLP:164.3700000
第 200 次 acc 0.7333333333333333
acc:0.73333 决策树:0.70000 随机森林:0.68000 SVM:0.70667 逻辑回归:0.72667 MLP:0.74000
max acc: 0.76
avg acc: 0.7205000000000003
max acc:0.7600000决策树:0.7666667 随机森林:0.7200000 SVM:0.7600000 逻辑回归:0.7666667 MLP:0.7800000
avg acc:0.7205000决策树:0.6878333 随机森林:0.6680333 SVM:0.7013333 逻辑回归:0.7137000 MLP:0.7341333
avg score:176.7650000决策树:194.7050000 随机森林:188.9350000 SVM:193.1800000 逻辑回归:169.9850000 MLP:163.8800000
其余省略
最终的测试结果将在(十)中展示。