Building a Bank Loan Risk-Control Model

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn import svm
# Load the data
data_load = "bankloan.xls"
data = pd.read_excel(data_load)
data.describe()
|       | 年龄 | 教育 | 工龄 | 地址 | 收入 | 负债率 | 信用卡负债 | 其他负债 | 违约 |
|-------|------|------|------|------|------|--------|------------|----------|------|
| count | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 | 700.000000 |
| mean  | 34.860000 | 1.722857 | 8.388571 | 8.278571 | 45.601429 | 10.260571 | 1.553553 | 3.058209 | 0.261429 |
| std   | 7.997342 | 0.928206 | 6.658039 | 6.824877 | 36.814226 | 6.827234 | 2.117197 | 3.287555 | 0.439727 |
| min   | 20.000000 | 1.000000 | 0.000000 | 0.000000 | 14.000000 | 0.400000 | 0.011696 | 0.045584 | 0.000000 |
| 25%   | 29.000000 | 1.000000 | 3.000000 | 3.000000 | 24.000000 | 5.000000 | 0.369059 | 1.044178 | 0.000000 |
| 50%   | 34.000000 | 1.000000 | 7.000000 | 7.000000 | 34.000000 | 8.600000 | 0.854869 | 1.987568 | 0.000000 |
| 75%   | 40.000000 | 2.000000 | 12.000000 | 12.000000 | 55.000000 | 14.125000 | 1.901955 | 3.923065 | 1.000000 |
| max   | 56.000000 | 5.000000 | 31.000000 | 34.000000 | 446.000000 | 41.300000 | 20.561310 | 27.033600 | 1.000000 |
data.columns
Index(['年龄', '教育', '工龄', '地址', '收入', '负债率', '信用卡负债', '其他负债', '违约'], dtype='object')
data.index
RangeIndex(start=0, stop=700, step=1)
## Convert to NumPy arrays and split the data

X = np.array(data.iloc[:,0:-1])
y = np.array(data.iloc[:,-1])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, train_size=0.8, test_size=0.2, shuffle=True)
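
Only about 26% of the samples default (the mean of 违约 above is 0.261), so a stratified split can keep that ratio identical in both subsets. A minimal sketch, not used for the results below (the _s variable names are arbitrary):

# optional: stratify=y preserves the ~26% default rate in both splits
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, random_state=1, train_size=0.8, test_size=0.2, shuffle=True, stratify=y)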

Decision tree

Dtree = DecisionTreeClassifier(max_leaf_nodes=3, random_state=13)  # shallow tree: at most 3 leaf nodes
Dtree.fit(X_train, y_train)
y_pred = Dtree.predict(X_test)
accuracy_score(y_test, y_pred)
0.7285714285714285
tree.plot_tree(Dtree)
plt.show()
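
Besides the plot, the fitted tree can also be dumped as text rules; a minimal sketch, assuming the DataFrame's first eight columns are the feature names:

from sklearn.tree import export_text
print(export_text(Dtree, feature_names=list(data.columns[:-1])))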

## Confusion matrix (seaborn heatmap)

cm = confusion_matrix(y_test, y_pred)
heatmap = sns.heatmap(cm, annot=True, fmt='d')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
plt.ylabel("true label")
plt.xlabel("predict label")
plt.show()

SVM

clf_svm = svm.SVC()            # separate name so the svm module is not shadowed
clf_svm.fit(X_train, y_train)  # fit on the training split, not the test split
y_pred = clf_svm.predict(X_test)
accuracy_score(y_test, y_pred)
0.7857142857142857
cm = confusion_matrix(y_test, y_pred)
heatmap = sns.heatmap(cm, annot=True, fmt='d')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
plt.ylabel("true label")
plt.xlabel("predicted label")
plt.show()
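
SVC is sensitive to feature scale (收入 goes up to 446 while 信用卡负债 stays roughly between 0 and 21), so standardizing the inputs in a pipeline is often worth trying; a minimal sketch, not part of the results above:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svc = make_pipeline(StandardScaler(), svm.SVC())  # scale, then classify
scaled_svc.fit(X_train, y_train)
print(accuracy_score(y_test, scaled_svc.predict(X_test)))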

Ensemble (hard voting)

# ensemble
from sklearn import svm
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = svm.SVC()
clf5 = DecisionTreeClassifier(max_leaf_nodes=3,random_state=13)
eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3), ('svm', clf4), ('dtree', clf5)], voting='hard')
eclf = eclf.fit(X_train, y_train)  # fit on the training split only, so X_test stays unseen
y_pred = eclf.predict(X_test)
accuracy_score(y_test, y_pred)
0.8357142857142857
cm = confusion_matrix(y_test, y_pred)
heatmap = sns.heatmap(cm, annot=True, fmt='d')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right')
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right')
plt.ylabel("true label")
plt.xlabel("predicted label")
plt.show()
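
To see how much each base model contributes to the vote, the estimators can be scored with cross-validation on the training split; a minimal sketch:

from sklearn.model_selection import cross_val_score
for name, clf in [('lr', clf1), ('rf', clf2), ('gnb', clf3), ('svm', clf4), ('dtree', clf5), ('voting', eclf)]:
    scores = cross_val_score(clf, X_train, y_train, cv=5)  # 5-fold CV accuracy
    print(name, scores.mean())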

BP neural network (PyTorch)

import torch
import torch.nn.functional as Fun
train_x = torch.FloatTensor(X_train)
train_y = torch.LongTensor(y_train)
test_x = torch.FloatTensor(X_test)
test_y = torch.LongTensor(y_test)
# BP (back-propagation) network definition
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = Fun.relu(self.hidden(x))      # hidden-layer activation: ReLU (sigmoid or tanh also work)
        x = self.out(x)                   # no activation on the output layer: CrossEntropyLoss expects raw logits
        return x
net = Net(n_feature=8, n_hidden=20, n_output=2)    # n_feature: input dimension, n_hidden: hidden units, n_output: number of classes
optimizer = torch.optim.SGD(net.parameters(), lr=0.02)  # stochastic gradient descent optimizer
loss_func = torch.nn.CrossEntropyLoss()  # cross-entropy loss
loss_record = []
for t in range(100):
    out = net(train_x)                 # forward pass on the training set
    loss = loss_func(out, train_y)     # compare outputs with the labels
    loss_record.append(loss.item())
    optimizer.zero_grad()   # clear accumulated gradients
    loss.backward()         # back-propagate
    optimizer.step()        # update the weights
out = net(test_x)  # out holds raw scores; Fun.softmax(out) turns them into a probability matrix
prediction = torch.max(out, 1)[1]  # torch.max returns (values, indices); [1] keeps the predicted class index
pred_y = prediction.data.numpy()
target_y = test_y.data.numpy()
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print(accuracy)
0.8214285714285714
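
As the comment in the code notes, out holds raw scores; softmax converts each row into class probabilities, which is useful when a default probability rather than a hard label is needed. A minimal sketch:

probs = Fun.softmax(out, dim=1)   # each row sums to 1
default_prob = probs[:, 1]        # probability of class 1 (default)
print(default_prob[:5])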
## Training loss curve
ax = sns.lineplot(data=loss_record)
plt.show()

Saving and loading the model
torch.save(net, 'net.pt')                       # save the whole module object
netC = torch.load('net.pt')                     # load it back
output = netC(test_x)                           # predict with the reloaded model
pred_y = torch.max(output, 1)[1].data.numpy()   # recompute predictions from the reloaded model's output
accuracy = float((pred_y == target_y).astype(int).sum()) / float(target_y.size)
print(accuracy)
0.8214285714285714
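
Saving the whole module object works, but the PyTorch docs recommend saving only the state_dict and rebuilding the Net class when loading; a minimal sketch (the file name net_state.pt and the variable netD are arbitrary):

torch.save(net.state_dict(), 'net_state.pt')        # weights only
netD = Net(n_feature=8, n_hidden=20, n_output=2)    # rebuild the same architecture
netD.load_state_dict(torch.load('net_state.pt'))
netD.eval()                                         # switch to inference mode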
