sklearn classification prediction example (KNN, GNB, ...) for a simple binary disease classification


Dataset and latest source code: link

code

from typing import Tuple
import numpy as np
import pandas as pd
import random
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
""" the exp6 """
def output_file(result_list, result_file, method):
    """ Write the prediction results (one label per line) to result_file;
    the method name is currently not written to the file. """
    with open(result_file, "w") as fos:
        result = ""
        for char in result_list:
            result = result + (str(char) + '\n')
        fos.write(result)

data_train = 'data-train.csv'
data_test = 'data-test.csv'
# classifiers to be compared:
clf_GNB = GaussianNB()
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()
clf_MLP = MLPClassifier(max_iter=10000)  # allow up to 10000 iterations so the MLP can converge
clf_RF = RandomForestClassifier()
clf_GB = GradientBoostingClassifier()
df_train_df = pd.read_csv(data_train, encoding='utf-8')
# feature columns of the training set (all but the last column)
x_arrays_train = np.array(df_train_df.iloc[:, :-1])
# the corresponding label column (the last one)
y_array_train = np.array(df_train_df.iloc[:, -1])
def generate_random_bools(x_array: np.ndarray, estimate_scale: float = 90):
    """ Build a random boolean mask over the rows of x_array: roughly estimate_scale percent
    of the entries are True (training part) and the rest are False (held-out part).
    Generating such a mask as an intermediate object makes it easy to apply the same random
    split to several paired containers (e.g. the feature array and the label array), which is
    more flexible than sampling a subset directly from one ndarray. """
    size = len(x_array)
    estimate_scale_int = int(size / 100 * estimate_scale)
    real_scale = size - estimate_scale_int
    true_list = [True for index in range(estimate_scale_int)]
    false_list = [False for index in range(real_scale)]
    bools = true_list + false_list
    random.shuffle(bools)
    return bools

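# Note: this boolean-mask hold-out is essentially what sklearn's train_test_split offers, e.g.
#   from sklearn.model_selection import train_test_split
#   x_tr, x_te, y_tr, y_te = train_test_split(x_arrays_train, y_array_train, test_size=0.1)
# the hand-rolled mask is kept here to make the paired random split explicit.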
def estimate_accuracy(bools: list[bool], x_array: np.ndarray, y_array: np.ndarray, clf=clf_GNB):
    """ Hold-out estimate: train clf on the rows where the mask is True,
    predict the rows where it is False, and return the accuracy. """
    estimate_accuracy_x_train: np.ndarray = x_array[bools]
    estimate_accuracy_y_train: np.ndarray = y_array[bools]
    # fit the model (classifier)
    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)
    # the rows where the mask is False form the test (estimation) set
    bools_reverse = [not bool_ for bool_ in bools]
    estimate_accuracy_x_test = x_array[bools_reverse]
    real_result = y_array[bools_reverse]
    estimate_predict_result = clf.predict(estimate_accuracy_x_test)
    # calculate the accuracy:
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1
    accuracy = count / length
    return accuracy

def estimate_by_k_fold(index_tuple: Tuple, x_arrays_train: np.ndarray = x_arrays_train,
                       y_array_train: np.ndarray = y_array_train, clf=clf_GNB):
    """ Evaluate one fold: index_tuple is (test_indexes, train_indexes);
    train clf on the second part, predict the first, and return the accuracy. """
    estimate_accuracy_x_train: np.ndarray = x_arrays_train[index_tuple[1]]
    estimate_accuracy_y_train: np.ndarray = y_array_train[index_tuple[1]]
    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)
    real_result = y_array_train[index_tuple[0]]
    estimate_accuracy_x_test = x_arrays_train[index_tuple[0]]
    estimate_predict_result = clf.predict(estimate_accuracy_x_test)
    # calculate the accuracy:
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1
    accuracy = count / length
    return accuracy

def estimate_by_k_fold_times(index_tuples, clf=clf_GNB):
    """ Run estimate_by_k_fold for every (test, train) index tuple and return the mean accuracy. """
    count_probability = 0
    for index_tuple in index_tuples:
        result = estimate_by_k_fold(index_tuple, clf=clf)
        count_probability += result
    # return the average accuracy over all folds
    return count_probability / len(index_tuples)

# define a series of k-fold related helpers:
""" Generate a randomized sequence of the row indexes of the training set x_arrays_train;
these indexes form a list index_list. Split that index sequence into k equal parts and
collect the start_index and end_index of each part. For a given part, the slice
index_list[start_index:end_index] selects a contiguous block of indexes whose rows in
x_arrays_train form the test set of that fold, while the two remaining slices,
index_list[:start_index] and index_list[end_index:], concatenated together give the
indexes of the matching training set.
"""
def get_random_indexes(x_arrays_train=x_arrays_train) -> list:
    """ Return a shuffled list of the row indexes of x_arrays_train. """
    size = len(x_arrays_train)
    indexes = [i for i in range(size)]
    random.shuffle(indexes)
    return indexes

def get_k_fold_test_indexes(random_indexes: list[int], k=5) -> list:
    """ Split the shuffled indexes into k folds and return a list of
    (test_indexes, train_indexes) tuples, one per fold. """
    test_index_tuples: list = []
    size = len(random_indexes)
    array_size = size // k
    for i in range(k):
        array_start = i * array_size
        array_end = array_start + array_size
        # slices are half-open: [array_start, array_end)
        test_index_tuples.append(
            (random_indexes[array_start:array_end],
             random_indexes[:array_start] + random_indexes[array_end:]))
    return test_index_tuples

# def get_k_fold_train_indexes(test_index_tuples, length, k=10) -> list:
# for tuple in test_index_tuples:
# def k_fold_estimate(x_array_train, k=10):
def random_reservation():
    """ Repeated random hold-out evaluation (90% train / 10% test) of all classifiers,
    averaged over several runs; prints the classifiers sorted by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        bools = generate_random_bools(x_arrays_train, 90)
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_accuracy(
                bools, x_arrays_train, y_array_train, clf=clf_i)
            clf_index += 1
    average_probs: np.ndarray = np.array(count_prob) / times
    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)
    # sort the classifiers by average accuracy, best first
    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)

def k_fold():
    """ k-fold cross-validation (k=5 by default) of all classifiers, repeated with
    re-shuffled folds and averaged; prints the classifiers sorted by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        index_tuples = get_k_fold_test_indexes(get_random_indexes(x_arrays_train))
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_by_k_fold_times(
                index_tuples, clf=clf_i)
            clf_index += 1
    average_probs: np.ndarray = np.array(count_prob) / times
    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)
    # sort the classifiers by average accuracy, best first
    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)


if "__main__" == __name__:
    k_fold()
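
For reference, the same shuffled k-fold comparison can be reproduced with sklearn's built-in utilities. A minimal sketch, assuming the data-train.csv layout used above (feature columns followed by a 0/1 label column):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv('data-train.csv', encoding='utf-8')
X = np.array(df.iloc[:, :-1])   # feature columns
y = np.array(df.iloc[:, -1])    # 0/1 label column

cv = KFold(n_splits=5, shuffle=True)   # shuffled 5-fold split, like get_k_fold_test_indexes()
for clf in (GaussianNB(), KNeighborsClassifier(), RandomForestClassifier()):
    scores = cross_val_score(clf, X, y, cv=cv)
    print(type(clf).__name__, scores.mean())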

code (initial version)

# initial version: relies on output_file(), GaussianNB and KNeighborsClassifier from the script above
def exp6():
    from openpyxl import Workbook
    from openpyxl.utils.dataframe import dataframe_to_rows
    import pandas as pd
    import numpy as np
    prefix = "./exp6/"
    data_train = 'data-train.csv'
    data_test = 'data-test.csv'
    data_train = prefix + data_train
    data_test = prefix + data_test
    prediction_result = prefix + 'prediction.txt'
    # read the raw data as UTF-8 (convert the files to UTF-8 with a text editor first if needed)
    df_train = pd.read_csv(data_train, encoding='utf-8')
    x_arrays = np.array(df_train.iloc[:, :-1])
    y_array = np.array(df_train['target'])
    df_test = pd.read_csv(data_test, encoding='utf-8')
    newx = np.array(df_test.iloc[:, :-1])
    KNN_exp6(x_arrays, y_array, newx, prediction_result)
    # GNB_exp6(x_arrays, y_array, newx, prediction_result)
    # print("predict_GNB:", clf.predict(newx))
    # print("probability_GNB:", clf.predict_proba(newx))


def KNN_exp6(x, y, newx, prediction_result):
    # the default n_neighbors is 5
    clf = KNeighborsClassifier(n_neighbors=7)
    clf.fit(x, y)
    result_list = clf.predict(newx)
    output_file(result_list, prediction_result, "KNN")


def GNB_exp6(x, y, newx, prediction_result):
    clf = GaussianNB()
    clf.fit(x, y)
    result_list = clf.predict(newx)
    output_file(result_list, prediction_result, "GNB")


exp6()
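
The commented-out predict_proba lines above hint at reporting class probabilities as well. A minimal sketch that prints each predicted 0/1 label together with its per-class probabilities, assuming the same file layout and the 'target' column name used above:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB

df_train = pd.read_csv('./exp6/data-train.csv', encoding='utf-8')
df_test = pd.read_csv('./exp6/data-test.csv', encoding='utf-8')
x = np.array(df_train.iloc[:, :-1])
y = np.array(df_train['target'])
newx = np.array(df_test.iloc[:, :-1])

clf = GaussianNB()
clf.fit(x, y)
labels = clf.predict(newx)        # hard 0/1 predictions
probs = clf.predict_proba(newx)   # shape (n_samples, 2): P(class 0), P(class 1)
for label, p in zip(labels, probs):
    print(label, p)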

Prediction: binary classification (labels 0/1)

data_train.csv:

(screenshot of data_train.csv)

data_test.csv:

(screenshot of data_test.csv)
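
Since the label is a 0/1 target, a quick sketch for checking the size and class balance of the training file (assuming the 'target' column name used in the code above):

import pandas as pd

df_train = pd.read_csv('data-train.csv', encoding='utf-8')
print(df_train.shape)                      # (number of rows, number of columns)
print(df_train['target'].value_counts())   # how many samples with label 0 vs label 1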
