使用神经网络做二分类预测

不想整理代码了。先给个结果图:

train 0 loss: 1838.0616
train 100 loss: 1441.5283
train 200 loss: 1299.4546
train 300 loss: 934.36536
train 400 loss: 506.06702
train 500 loss: 322.9782
train 600 loss: 271.5825
train 700 loss: 360.091
train 800 loss: 237.25177
train 900 loss: 332.97592
train 1000 loss: 117.5983
train 1100 loss: 173.39397
train 1200 loss: 51.26674
train 1300 loss: 82.82826
train 1400 loss: 74.705734
train 1500 loss: 113.63321
train 1600 loss: 71.29809
train 1700 loss: 38.41456
train 1800 loss: 82.75247
train 1900 loss: 44.553272
test 0,accuracy:0.953125,auc: (0.0, 0.9708618)
test 1,accuracy:0.9375,auc: (0.9708618, 0.96028894)
test 2,accuracy:0.9609375,auc: (0.96028894, 0.9594982)
test 3,accuracy:0.953125,auc: (0.9594982, 0.96195656)
test 4,accuracy:0.9375,auc: (0.96195656, 0.9627208)

loss这么大,结果这么准确。我也搞不懂是怎么肥事呀。

                                                     

AUC也没什么问题。暂时认为是好的吧。

下面是源码dataUtil用来对数据预处理:

import pandas as pd
import numpy as np
def load_csv(filename):
    data=pd.read_csv(filename)
    data = data.drop(data.columns[39:], axis=1)
    return data
def toInt(y):
    return int(y)
def split_x_y(data):
    keys=data.keys().tolist()
    y=data["Label"]
    keys.remove("Label")
    x=data[keys]
    return (x.values,y.values)
def max_min_nomalize(X):
    keys=X.keys().tolist()
    keys.remove("Index")
    keys.remove("Label")
    #keys.remove("Gender")
    #keys=["BMI","JiGan","ShouSuoYa","ShuZhangYa"]
    #删掉JiGan为-1的人
    #X = X[X["JiGan"].isin([-1.0]) == False]
    for key in keys:
        #normalize_col=(X[key]-(X[key].max()+X[key].min())/2)/(X[key].max()-X[key].min())
        #测试1:用mean来normolize
        normalize_col = (X[key] - X[key].mean()) / (X[key].max() - X[key].min())
        X = X.drop(key, axis=1)
        X[key]=normalize_col
    return X

if __name__=="__main__":
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 500)
    data=load_csv("./data/patient_data.csv")
    #print(data.head())
    print(data.info())
    print(data.describe())
    print(data.count())
    #print(data["Label"].value_counts())
    #data=data[data["JiGan"].isin([-1.0])==False]
   # print(data)
    #print(data)
    #print(data.describe())
    #x=max_min_nomalize(data)
    # for key in data.keys().tolist():
    #     print("********************************************************{}**********************************".format(key))
    #     print(data[key].value_counts())
    #data=load_csv("F:\workspaces\pycharm\Patient\data\patient_data.csv")
    #data=max_min_nomalize(data)
    #print(data.head())
dataUtil

然后是dnnModel用来构建模型:

import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from tensorflow.contrib import layers


class dnnModel():
    def __init__(self,x_train,y_train,x_test,y_test,learn_rate):

        self.epoch=0
        self.learn_rate=learn_rate
        self.h1_dimen=500
        self.h2_dimen=500
        self.load_data2(x_train,y_train,x_test,y_test)
        #self.load_data(x_train, y_train)
    def load_data2(self,x_train,y_train,x_test,y_test):
        self.x_datas=x_train
        self.y_datas=y_train
        self.x_datas_test=x_test
        self.y_datas_test=y_test
        self.num_datas=self.y_datas.shape[0]
        self.num_datas_test=self.y_datas_test.shape[0]
        self.input_dimen=self.x_datas.shape[1]
        self.output_dimen = self.y_datas.shape[1]
        self.shullf()
    def load_data(self,x,y):
        datas_len=x.shape[0]
        self.x_datas=x[0:datas_len*8//10]
        self.y_datas=y[0:datas_len*8//10]
        self.x_datas_test=x[datas_len*8//10:]
        self.y_datas_test=y[datas_len*8//10:]
        self.num_datas=self.y_datas.shape[0]
        self.num_datas_test=self.y_datas_test.shape[0]
        self.input_dimen=self.x_datas.shape[1]
        self.output_dimen = self.y_datas.shape[1]
        self.shullf()
        #self.output_dimen = 1
    def shullf(self):
        perm=np.arange(self.num_datas)
        np.random.shuffle(perm)
        self.x_datas=self.x_datas[perm]
        self.y_datas=self.y_datas[perm]
        perm=np.arange(self.num_datas_test)
        np.random.shuffle(perm)
        self.x_datas_test=self.x_datas_test[perm]
        self.y_datas_test=self.y_datas_test[perm]
    def weight_variable(self,shape,reg=True):
        init=tf.random_normal(shape=shape,dtype=tf.float32)
        if reg:
            if reg == True:
                regularizer = layers.l2_regularizer(0.05)
            else:
                regularizer = None
        return tf.Variable(init)
    def bias_variable(self,shape):
        init=tf.constant(0.1,dtype=tf.float32,shape=shape)
        return tf.Variable(init)
    def next_batch(self,batchsize):
        start=self.epoch
        self.epoch+=batchsize
        if self.epoch>self.num_datas:
            perm=np.arange(self.num_datas)
            np.random.shuffle(perm)
            self.x_datas=self.x_datas[perm]
            self.y_datas=self.y_datas[perm]
            self.epoch=batchsize
            start=0
        end=self.epoch
        return self.x_datas[start:end],self.y_datas[start:end]
    def add_layer(self,x,input_dimen,output_dimen,name,relu=True):
        with tf.name_scope(name):
            weight = self.weight_variable([input_dimen, output_dimen])
            bias = self.bias_variable([output_dimen])
            tf.summary.histogram(name+"/weight",weight)
            tf.summary.histogram(name+"/bias",bias)
            if relu:
                return tf.nn.relu(tf.matmul(x,weight)+bias)
            else:
                return tf.matmul(x,weight)+bias
    def constructDnn(self,input_x):
        #输入层
        input_layer=self.add_layer(input_x,self.input_dimen,500,name="input_layer",relu=True)
        #一个隐藏层
        h1=self.add_layer(input_layer,500,500,relu=True,name="hidden_layer1")
        h1_drop=tf.nn.dropout(h1,keep_prob=0.7)
        #在增加一个隐藏层
        h2=self.add_layer(h1_drop,500,1024,relu=True,name="hidden_layer2")
        h2_drop=tf.nn.dropout(h2,keep_prob=0.8)
        # 在增加一个隐藏层
        # h3 = self.add_layer(h2_drop, 500, 500, relu=True, name="hidden_layer2")
        # h3_drop = tf.nn.dropout(h3, keep_prob=0.8)
        #输出层
        output_layer=self.add_layer(h2_drop,1024,self.output_dimen,"output_layer",relu=False)
        tf.summary.histogram('/outputs', output_layer)
        return output_layer

    def train(self,maxTrainTimes,batchsize):
        X=tf.placeholder(dtype=tf.float32,shape=[None,self.input_dimen])
        Y=tf.placeholder(dtype=tf.float32,shape=[None,self.output_dimen])
        y_pre=self.constructDnn(X)
        entropy=tf.nn.softmax_cross_entropy_with_logits(logits=y_pre,labels=Y)
        #entropy=-tf.reduce_sum(Y*tf.log(tf.nn.softmax(y_pre)))
        loss = tf.reduce_mean(entropy)
        optimizer=tf.train.AdamOptimizer(self.learn_rate).minimize(loss)

        with tf.name_scope("evl"):
            correct=tf.equal(tf.argmax(y_pre,1),tf.argmax(Y,1))
            accuracy=tf.reduce_mean(tf.cast(correct,dtype=tf.float32))
            a = tf.cast(tf.argmax(y_pre, 1), tf.float32)
            b = tf.cast(tf.argmax(Y, 1), tf.float32)
            auc = tf.contrib.metrics.streaming_auc(a, b)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        #tf.summary.scalar("auc", auc)
        merged_summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter('./tmp/mnist_logs')
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.initialize_local_variables())
            summary_writer.add_graph(sess.graph)
            for i in range(maxTrainTimes):
                x_train,y_train=self.next_batch(batchsize)
                sess.run(optimizer,feed_dict={X:x_train,Y:y_train})
                #print(sess.run(y_pre,feed_dict={X:x_train,Y:y_train}))
                #print(sess.run(entropy, feed_dict={X: x_train, Y: y_train}))
                if i%100==0:
                    print("train {} loss:".format(i),sess.run(loss,feed_dict={X:x_train,Y:y_train}))
                    s = sess.run(merged_summary_op, feed_dict={X:x_train,Y:y_train})
                    summary_writer.add_summary(s, i)
            testTime=self.num_datas_test//batchsize
            for i in range(testTime):
                x_train, y_train = self.next_batch(batchsize)
                testAcc=sess.run(accuracy, feed_dict={X: x_train, Y: y_train})
                testAuc=sess.run(auc,feed_dict={X: x_train, Y: y_train})
                y_pred_pro = sess.run(y_pre,feed_dict={X: x_train, Y: y_train})
                y_scores = np.array(y_pred_pro)
                auc_value = roc_auc_score(y_train, y_scores)
                a=np.array(y_train)[:,1]
                b=y_scores[:,1]
                fpr, tpr, thresholds = roc_curve(a,b , pos_label=1.0)
                plt.figure(figsize=(6, 4))
                plt.plot(fpr, tpr, color='blue', linewidth=2, label='AUC (area:%0.4f)' % auc_value)
                plt.plot([0, 1], [0, 1], color='black', linewidth=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.0])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title('ROC')
                plt.legend(loc="lower right")
                plt.show()
                print("test {},accuracy:{},auc: {}".format(i,testAcc,testAuc))
    def svm_train(self,maxTrainTimes,batchsize):
        # 初始化feedin
        x_data = tf.placeholder(shape=[None, self.input_dimen], dtype=tf.float32)
        y_target = tf.placeholder(shape=[None,1], dtype=tf.float32)

        # 创建变量
        A = tf.Variable(tf.random_normal(shape=[self.input_dimen, 1]))
        b = tf.Variable(tf.random_normal(shape=[1, 1]))

        # 定义线性模型
        model_output = tf.subtract(tf.matmul(x_data, A), b)

        # Declare vector L2 'norm' function squared
        l2_norm = tf.reduce_sum(tf.square(A))

        # Loss = max(0, 1-pred*actual) + alpha * L2_norm(A)^2
        alpha = tf.constant([0.01])
        classification_term = tf.reduce_mean(tf.maximum(0., tf.subtract(1., tf.multiply(model_output, y_target))))
        loss = tf.add(classification_term, tf.multiply(alpha, l2_norm))
        my_opt = tf.train.GradientDescentOptimizer(0.01)
        train_step = my_opt.minimize(loss)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(maxTrainTimes):
                train_x,train_y=self.next_batch(batchsize)
                train_y=train_y.reshape([-1,1])
                sess.run(train_step, feed_dict={x_data: train_x, y_target: train_y})
                if i%100==0:
                    print("loss in train step {}: {}".format(i,sess.run(loss,feed_dict={x_data: train_x, y_target: train_y})))
dnnModel

最后是程序入口,读取数据,喂给神经网络:

import dataUtil
import dnnModel
import tensorflow as tf
import numpy as np
import numpy as np
import tensorflow as tf


def one_hot(labels,class_num):
    labels=changeToint(labels)
    b = tf.one_hot(labels, class_num, 1, 0)
    with tf.Session() as sess:
        return sess.run(b)
def changeToint(list):
    a=range(len(list))
    for i in range(len(list)):
        #print("now in {},and total is {}".format(i,len(list)))
        if (int(list[i]))==0 :
            list[i]=int(0)
        else:
            list[i]=int(1)
        #print(i,list[i])
    return list
def select_x_y(x,y):
    #return x,y,x,y
    #选择2 4的数据
    x_train_selected=[]
    y_train_selected=[]
    x_test_selected=[]
    y_test_selected=[]
    for i in range(x.shape[0]):
        item=x[i]
        if item[0]>20000 and item[0]<30000:
            x_train_selected.append(x[i,1:])
            y_train_selected.append(y[i])
        elif item[0]>40000:
            x_train_selected.append(x[i,1:])
            y_train_selected.append(y[i])
        else:
            x_test_selected.append(x[i,1:])
            y_test_selected.append(y[i])
    return np.array(x_train_selected),np.array(y_train_selected),np.array(x_test_selected),np.array(y_test_selected)

if __name__=="__main__":
    data=dataUtil.load_csv("F:\workspaces\pycharm\Patient\data\patient_data.csv")
    #for key in data.keys().tolist():
    # print(data.info())
    # print(data["ShuZhangYa"].count())
    x,y=dataUtil.split_x_y(dataUtil.max_min_nomalize(data))
    y=one_hot(y,2)
    #y=changeToint(y).reshape([-1,1])
    #print(y)
    x_train,y_train,x_test,y_test=select_x_y(x,y)
    #from tensorflow.examples.tutorials.mnist import input_data
    #MNIST = input_data.read_data_sets("mnist", one_hot=True)
    #mydnn=dnnModel.dnnModel(MNIST.train.images,MNIST.train.labels,0.1)
    #mydnn = dnnModel.dnnModel(x,changeToint(y).reshape([-1,1]),0.1)
    mydnn = dnnModel.dnnModel(x_train,y_train,x_test,y_test, 0.001)
    #mydnn.svm_train(1000,50)
    mydnn.train(2000,128)
main

 

posted @ 2018-08-27 19:35  超级学渣渣  阅读(6881)  评论(0编辑  收藏  举报