#
K-NN分类 import os import sys import time import operator import cx_Oracle import numpy as np import pandas as pd import tensorflow as tf conn=cx_Oracle.connect('doctor/admin@localhost:1521/tszr') cursor = conn.cursor() #获取数据集 def getdata(surgery,surgeryChest): sql = "select feature1,feature2,feature3,feature4,feature5,trainLable \ from menzhenZ where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest) cursor.execute(sql) rows = cursor.fetchall() dataset = [] lables = [] for row in rows: temp = [] temp.append(row[0]) temp.append(row[1]) temp.append(row[2]) temp.append(row[3]) temp.append(row[4]) dataset.append(temp) lables.append(row[5]) return np.array(dataset),np.array(lables) def gettestdata(surgery,surgeryChest): sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from \ testZ where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest) cursor.execute(sql) rows = cursor.fetchall() testdataset = [] testlables = [] for row in rows: temp = [] temp.append(row[0]) temp.append(row[1]) temp.append(row[2]) temp.append(row[3]) temp.append(row[4]) testdataset.append(temp) testlables.append(row[5]) return np.array(testdataset),np.array(testlables) #K-NN分类 def classify0(inX, dataSet, labels, k): dataSetSize = dataSet.shape[0] diffMat = np.tile(inX, (dataSetSize,1)) - dataSet sqDiffMat = diffMat**2 sqDistances = sqDiffMat.sum(axis=1) distances = sqDistances**0.5 sortedDistIndicies = distances.argsort() classCount={} for i in range(k): voteIlabel = labels[sortedDistIndicies[i]] classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] #归一化 def autoNorm(dataSet): minVals = dataSet.min(0) maxVals = dataSet.max(0) ranges = maxVals - minVals normDataSet = np.zeros(np.shape(dataSet)) m = dataSet.shape[0] normDataSet = dataSet - np.tile(minVals, (m,1)) normDataSet = normDataSet/np.tile(ranges, (m,1)) return normDataSet, ranges, minVals erace = [] accuc = [] t = [] #启动和检测模型 def datingClassTest(): datingDataMat,datingLabels = getdata("外科","胸外科") normMat, ranges, minVals = autoNorm(datingDataMat) testdataset,testlables = gettestdata("外科","胸外科") testnormMat, testranges, testminVals = autoNorm(testdataset) errorCount = 0.0 start = time.time() for j in [3,5,7,9,11,13]: for i in range(np.shape(testnormMat)[0]): classifierResult = classify0(testnormMat[i,:],normMat,datingLabels,j) print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, testlables[i])) if (classifierResult != testlables[i]): errorCount += 1.0 end = time.time() t.append(end) erace.append(errorCount/float(np.shape(testnormMat)[0])*100) accuc.append((1.0-errorCount/float(np.shape(testnormMat)[0]))*100) print("错误率: %.2f%%" % (errorCount/float(np.shape(testnormMat)[0])*100)) print("准确率: %.2f%%" % ((1.0-errorCount/float(np.shape(testnormMat)[0]))*100)) print("训练和预测一共耗时: %.2f 秒" % (end-start)) datingClassTest() print(accuc) print(erace) print(t)

 

#探索不同的K值对算法的影响

import matplotlib.pyplot as plt

x = [3,5,7,9,11,13]
plt.plot(x,erace,c='r')
plt.plot(x,accuc,c='g')
plt.legend(['error race','accuce race'],loc=9)
plt.show()
print(accuc)
print(erace)

#决策树
import os
import sys
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
from math import log
import tensorflow as tf

conn=cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

#获取数据集
def getdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from menzhenZ where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    dataset = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        temp.append(row[5])
        dataset.append(temp)
    lables = []
    lables.append("呼吸急促")
    lables.append("持续性脉搏加快")
    lables.append("畏寒")
    lables.append("血压降低")
    lables.append("咳血")
    return dataset,lables

def gettestdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from testZ where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    testdataset = []
    testlables = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        testdataset.append(temp)
        testlables.append(row[5])
    return testdataset,testlables

#计算熵值
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet: 
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannonEnt -= prob * log(prob,2) 
    return shannonEnt
    
#按照给定特征划分数据集
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]    
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

#选择最好的属性
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1     
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):       
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)       
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)     
        infoGain = baseEntropy - newEntropy    
        if (infoGain > bestInfoGain):      
            bestInfoGain = infoGain        
            bestFeature = i
    return bestFeature                     

#统计机制
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

#创建决策树
def createTree(dataSet,labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList): 
        return classList[0]
    if len(dataSet[0]) == 1: 
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    temp = []
    for i in labels:
        if i != labels[bestFeat]:
            temp.append(i)
    labels = temp
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]      
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree   

#使用决策树模型分类
def classify(inputTree,featLabels,testVec):
    for i in inputTree.keys():
        firstStr = i
        break
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict): 
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel

#启动和检测模型
def datingClassTest():
    dataSet,labels = getdata("外科","胸外科")
    myTree = createTree(dataSet,labels)
    testdataset,testlables = gettestdata("外科","胸外科")
    errorCount = 0.0
    start = time.time()
    for i in range(np.shape(testdataset)[0]):
        classifierResult = classify(myTree,labels,testdataset[i])
        print("the classifier came back with: %s, the real answer is: %s" % (classifierResult, testlables[i]))
        if (classifierResult != testlables[i]): 
            errorCount += 1.0
    end = time.time()
    print("错误率: %.2f%%" % (errorCount/float(np.shape(testdataset)[0])*100))
    print("准确率: %.2f%%" % ((1.0-errorCount/float(np.shape(testdataset)[0]))*100))
    print("训练和预测一共耗时: %.2f 秒" % (end-start))
datingClassTest()

#选取前600条记录生成并打印决策树
dataSet,labels = getdata("外科","胸外科")
dataSet = dataSet[0:600]
labels = labels[0:600]
myTree = createTree(dataSet,labels)
print(myTree)

 

#比较K-NN算法与决策树算法的优劣
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.array([10,12])
y = [85.6,87.3]
plt.bar(x,y,edgecolor='yellow')
for i,j in zip(x,y):
    plt.text(i-0.2,j-0.2,'%.2f%%' % j)
plt.text(9.7,40,'K-NN right race')
plt.text(11.7,40,'Tree right race')
plt.show()

#使用神经网络探索数据集
import sys
import os
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf

conn=cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

#one-hot编码
def onehot(labels):
    n_sample = len(labels)
    n_class = max(labels) + 1
    onehot_labels = np.zeros((n_sample, n_class))
    onehot_labels[np.arange(n_sample), labels] = 1
    return onehot_labels

#获取数据集
def getdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from menzhen where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    dataset = []
    lables = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        dataset.append(temp)
        if(row[5]==3):
            lables.append(0)
        elif(row[5]==6):
            lables.append(1)
        else:
            lables.append(2)
    dataset = np.array(dataset)
    lables = np.array(lables)
    dataset = dataset.astype(np.float32)
    labless = onehot(lables)
    return dataset,labless

#获取测试数据集
def gettestdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from test where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    testdataset = []
    testlables = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        testdataset.append(temp)
        if(row[5]==3):
            testlables.append(0)
        elif(row[5]==6):
            testlables.append(1)
        else:
            testlables.append(2)
    testdataset = np.array(testdataset)
    testlables = np.array(testlables)
    testdataset = testdataset.astype(np.float32)
    testlabless = onehot(testlables)
    return testdataset,testlabless

dataset,labless = getdata("外科","胸外科")
testdataset,testlables = gettestdata("外科","胸外科")

dataset = dataset[0:100]
labless = labless[0:100]

x_data = tf.placeholder("float32", [None, 5])
y_data = tf.placeholder("float32", [None, 3])


weight = tf.Variable(tf.ones([5, 3]))
bias = tf.Variable(tf.ones([3]))

#使用softmax激活函数
y_model = tf.nn.softmax(tf.matmul(x_data, weight) + bias)

#y_model = tf.nn.relu(tf.matmul(x_data, weight) + bias)

# loss = tf.reduce_sum(tf.pow((y_model - y_data), 2))

#使用交叉熵作为损失函数
loss = -tf.reduce_sum(y_data*tf.log(y_model))

# train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)

#使用AdamOptimizer优化器
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)


#train_step = tf.train.MomentumOptimizer(1e-4,0.9).minimize(loss)

#评估模型
correct_prediction = tf.equal(tf.argmax(y_model, 1), tf.argmax(y_data, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
start = time.time()
for _ in range(10):
    for i in range(int(len(dataset)/100)):
        sess.run(train_step, feed_dict={x_data:dataset[i:i+100,:], y_data:labless[i:i+100,:]})
print("模型准确率",sess.run(accuracy, feed_dict={x_data:testdataset , y_data:testlables}))
end = time.time()
print("模型训练和测试公耗时:%.2f 秒" % (end-start))

#加深一层神经网络
import sys
import os
import time
import operator
import cx_Oracle
import numpy as np
import pandas as pd
import tensorflow as tf

conn=cx_Oracle.connect('doctor/admin@localhost:1521/tszr')
cursor = conn.cursor()

#one-hot编码
def onehot(labels):
    n_sample = len(labels)
    n_class = max(labels) + 1
    onehot_labels = np.zeros((n_sample, n_class))
    onehot_labels[np.arange(n_sample), labels] = 1
    return onehot_labels

#获取数据集
def getdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from menzhen where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    dataset = []
    lables = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        dataset.append(temp)
        if(row[5]==3):
            lables.append(0)
        elif(row[5]==6):
            lables.append(1)
        else:
            lables.append(2)
    dataset = np.array(dataset)
    lables = np.array(lables)
    dataset = dataset.astype(np.float32)
    labless = onehot(lables)
    return dataset,labless

def gettestdata(surgery,surgeryChest):
    sql = "select feature1,feature2,feature3,feature4,feature5,trainLable from test where surgery='%s' and surgeryChest='%s'" % (surgery,surgeryChest)
    cursor.execute(sql)
    rows = cursor.fetchall()
    testdataset = []
    testlables = []
    for row in rows:
        temp = []
        temp.append(row[0])
        temp.append(row[1])
        temp.append(row[2])
        temp.append(row[3])
        temp.append(row[4])
        testdataset.append(temp)
        if(row[5]==3):
            testlables.append(0)
        elif(row[5]==6):
            testlables.append(1)
        else:
            testlables.append(2)
    testdataset = np.array(testdataset)
    testlables = np.array(testlables)
    testdataset = testdataset.astype(np.float32)
    testlabless = onehot(testlables)
    return testdataset,testlabless

dataset,labless = getdata("外科","胸外科")
testdataset,testlables = gettestdata("外科","胸外科")

dataset = dataset[0:100]
labless = labless[0:100]

x_data = tf.placeholder("float32", [None, 5])
y_data = tf.placeholder("float32", [None, 3])

weight1 = tf.Variable(tf.ones([5, 20]))
bias1 = tf.Variable(tf.ones([20]))
y_model1 = tf.matmul(x_data, weight1) + bias1

#加深一层神经网络
weight2 = tf.Variable(tf.ones([20, 3]))
bias2 = tf.Variable(tf.ones([3]))
y_model = tf.nn.softmax(tf.matmul(y_model1, weight2) + bias2)

loss = tf.reduce_sum(tf.pow((y_model - y_data), 2))
# loss = -tf.reduce_sum(y_data*tf.log(y_model))

#train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(loss)
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
# train_step = tf.train.MomentumOptimizer(1e-4,0.9).minimize(loss)

correct_prediction = tf.equal(tf.argmax(y_model, 1), tf.argmax(y_data, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
start = time.time()
for _ in range(10):
    for i in range(int(len(dataset)/100)):
        sess.run(train_step, feed_dict={x_data:dataset[i:i+100,:], y_data:labless[i:i+100,:]})
print("模型准确率",sess.run(accuracy, feed_dict={x_data:testdataset , y_data:testlables}))
end = time.time()
print("模型训练和测试公耗时:%.2f 秒" % (end-start))

#比较决策树与神经网络的优劣
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = np.array([10,12])
y = [87.1,87.4]
plt.bar(x,y,edgecolor="yellow")
for i,j in zip(x,y):
    plt.text(i-0.2,j-0.2,"%.2f%%" % j)
plt.text(9.7,40,"Tree right race")
plt.text(11.7,40,"Net right race")
plt.scatter([9.7,11.7],[0.05,0.36],c="r")
plt.plot([9.7,11.7],[0.05,0.36],c="g")
plt.show()

#统计各种算法处理模型数据
K-NN算法:
当K取[3,5,7,9,11,13]时,对应的:
准确率:[85.6, 72.6, 60.0, 47.4, 34.8, 22.299999999999996]
总耗时:[1554119134.435363, 1554119136.6192698, 
     1554119138.846019, 1554119141.2507513, 1554119143.4782736, 1554119145.5415804]

决策树:
准确率: 87.10%
训练和预测一共耗时: 0.05 秒
    
神经网络设计:
1 最小二乘法 softmax GradientDescentOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.162 最小二乘法 softmax AdamOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.193 最小二乘法 softmax MomentumOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.184 最小二乘法 relu GradientDescentOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.175 最小二乘法 relu AdamOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.156 最小二乘法 relu MomentumOptimizer 模型
模型准确率 0.006
模型训练和测试公耗时:0.197 交叉熵 softmax GradientDescentOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.098 交叉熵 softmax AdamOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.089 交叉熵 softmax MomentumOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.0610 交叉熵 relu GradientDescentOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.0811 交叉熵 relu AdamOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.0812 交叉熵 relu MomentumOptimizer 模型
模型准确率 0.874
模型训练和测试公耗时:0.09 秒

从上面的12种神经网络设计模型中可以看出:最小二乘法 relu MomentumOptimizer 模型
的准确率只有0.006,所以这种模型的设计是失败的。

a = [0.874]*10
print(a)
#计算成功的各种神经网络模型的准确率与耗时的比值:
a = [0.874]*11
b = [0.16,0.19,0.18,0.17,0.15,0.09,0.08,0.06,0.08,0.09,0.09]
c = []
for i in range(len(a)):
    c.append(a[i]/b[i])
for i in range(len(c)):
    print("准确率与耗时的比值:%.4f" % (c[i]))

#K-NN算法
#当K取3、5、7、9、11、13时的准确率饼图分布显示
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

acc = [85.6, 72.6, 60.0, 47.4, 34.8, 22.2]
labels = ['K-3','K-5','K-7','K-9','K-11','K-13']
plt.pie(acc,labels=labels,shadow=True,startangle=90,autopct='%1.4f%%')
plt.axis('equal')
plt.title('K-NN',fontsize=25)
plt.show()

#K-NN算法耗时散点图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

x = np.array([1,2,3,4,5,6])
z = np.array([1554119134.435363, 1554119136.6192698,1554119138.846019,
              1554119141.2507513, 1554119143.4782736, 1554119145.5415804])
plt.scatter(x,z,c='g')
plt.xticks(x+0.4,['KNN-1','KNN-2','KNN-3','KNN-4','KNN-5','KNN-6'])
plt.show()

#神经网络算法对应各种有用的模型设计耗时曲线图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

x = np.array([1,2,3,4,5,6,7,8,9,10,11])
z = np.array([0.16,0.19,0.18,0.17,0.15,0.09,0.08,0.06,0.08,0.09,0.09])
plt.scatter(x,z,c='r')
plt.xticks(x+0.4,['NET-1','NET-2','NET-3','NET-4','NET-5',
                 'NET-6','NET-7','NET-8','NET-9','NET-10','NET-11'])
plt.show()

#K-NN、决策树以及神经网络算法对比
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

acc = [85.6, 72.6, 60.0, 47.4, 34.8, 22.2,87.10,0.874,
       87.4,87.4,87.4,87.4,87.4,87.4,87.4,87.4,87.4,87.4]
labels = ['K-3','K-5','K-7','K-9','K-11','K-13','TREE',
          'NET-1','NET-2','NET-3','NET-4','NET-5','NET-6','NET-7',
         'NET-8','NET-9','NET-10','NET-11']
explode = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.2,0,0,0]
plt.pie(acc,labels=labels,explode=explode,shadow=True,startangle=90,autopct='%1.4f%%')
plt.axis('equal')
plt.title('K-NN AND TREE AND NET',fontsize=25)
plt.show()