sklearn classification prediction example (KNN, GNB, ...) for a simple binary disease classification


Dataset and latest source code: link

code

from typing import Tuple
import numpy as np
import pandas as pd
import random
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
""" the exp6 """
def output_file(result_list, result_file, method):
    """ Write the prediction results (one label per line) to result_file;
    the method name is currently not written to the file. """
    with open(result_file, "w") as fos:
        result = ""
        for char in result_list:
            result = result + (str(char) + '\n')
        fos.write(result)

data_train = 'data-train.csv'
data_test = 'data-test.csv'
# classifiers to be compared:
clf_GNB = GaussianNB()
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()
clf_MLP = MLPClassifier(max_iter=10000)  # allow up to 10000 iterations so the MLP can converge
clf_RF = RandomForestClassifier()
clf_GB = GradientBoostingClassifier()
df_train_df = pd.read_csv(data_train, encoding='utf-8')
# feature columns of the training set (all but the last column)
x_arrays_train = np.array(df_train_df.iloc[:, :-1])
# the corresponding label column (the last one)
y_array_train = np.array(df_train_df.iloc[:, -1])
def generate_random_bools(x_array: np.ndarray, estimate_scale: float = 90):
    """ Build a random boolean mask over the rows of x_array: roughly estimate_scale percent
    of the entries are True (training part) and the rest are False (held-out part).
    Generating such a mask as an intermediate object makes it easy to apply the same random
    split to several paired containers (e.g. the feature array and the label array), which is
    more flexible than sampling a subset directly from one ndarray. """
    size = len(x_array)
    estimate_scale_int = int(size / 100 * estimate_scale)
    real_scale = size - estimate_scale_int
    true_list = [True for index in range(estimate_scale_int)]
    false_list = [False for index in range(real_scale)]
    bools = true_list + false_list
    random.shuffle(bools)
    return bools

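# Note: this boolean-mask hold-out is essentially what sklearn's train_test_split offers, e.g.
#   from sklearn.model_selection import train_test_split
#   x_tr, x_te, y_tr, y_te = train_test_split(x_arrays_train, y_array_train, test_size=0.1)
# the hand-rolled mask is kept here to make the paired random split explicit.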
def estimate_accuracy(bools: list[bool], x_array: np.ndarray, y_array: np.ndarray, clf=clf_GNB):
    """ Hold-out estimate: train clf on the rows where the mask is True,
    predict the rows where it is False, and return the accuracy. """
    estimate_accuracy_x_train: np.ndarray = x_array[bools]
    estimate_accuracy_y_train: np.ndarray = y_array[bools]
    # fit the model (classifier)
    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)
    # the rows where the mask is False form the test (estimation) set
    bools_reverse = [not bool_ for bool_ in bools]
    estimate_accuracy_x_test = x_array[bools_reverse]
    real_result = y_array[bools_reverse]
    estimate_predict_result = clf.predict(estimate_accuracy_x_test)
    # calculate the accuracy:
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1
    accuracy = count / length
    return accuracy

def estimate_by_k_fold(index_tuple: Tuple, x_arrays_train: np.ndarray = x_arrays_train,
                       y_array_train: np.ndarray = y_array_train, clf=clf_GNB):
    """ Evaluate one fold: index_tuple is (test_indexes, train_indexes);
    train clf on the second part, predict the first, and return the accuracy. """
    estimate_accuracy_x_train: np.ndarray = x_arrays_train[index_tuple[1]]
    estimate_accuracy_y_train: np.ndarray = y_array_train[index_tuple[1]]
    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)
    real_result = y_array_train[index_tuple[0]]
    estimate_accuracy_x_test = x_arrays_train[index_tuple[0]]
    estimate_predict_result = clf.predict(estimate_accuracy_x_test)
    # calculate the accuracy:
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1
    accuracy = count / length
    return accuracy

def estimate_by_k_fold_times(index_tuples, clf=clf_GNB):
    """ Run estimate_by_k_fold for every (test, train) index tuple and return the mean accuracy. """
    count_probability = 0
    for index_tuple in index_tuples:
        result = estimate_by_k_fold(index_tuple, clf=clf)
        count_probability += result
    # return the average accuracy over all folds
    return count_probability / len(index_tuples)

# define a series of k-fold related helpers:
""" Generate a randomized sequence of the row indexes of the training set x_arrays_train;
these indexes form a list index_list. Split that index sequence into k equal parts and
collect the start_index and end_index of each part. For a given part, the slice
index_list[start_index:end_index] selects a contiguous block of indexes whose rows in
x_arrays_train form the test set of that fold, while the two remaining slices,
index_list[:start_index] and index_list[end_index:], concatenated together give the
indexes of the matching training set.
"""
def get_random_indexes(x_arrays_train=x_arrays_train) -> list:
    """ Return a shuffled list of the row indexes of x_arrays_train. """
    size = len(x_arrays_train)
    indexes = [i for i in range(size)]
    random.shuffle(indexes)
    return indexes

def get_k_fold_test_indexes(random_indexes: list[int], k=5) -> list:
    """ Split the shuffled indexes into k folds and return a list of
    (test_indexes, train_indexes) tuples, one per fold. """
    test_index_tuples: list = []
    size = len(random_indexes)
    array_size = size // k
    for i in range(k):
        array_start = i * array_size
        array_end = array_start + array_size
        # slices are half-open: [array_start, array_end)
        test_index_tuples.append(
            (random_indexes[array_start:array_end],
             random_indexes[:array_start] + random_indexes[array_end:]))
    return test_index_tuples

# def get_k_fold_train_indexes(test_index_tuples, length, k=10) -> list:
# for tuple in test_index_tuples:
# def k_fold_estimate(x_array_train, k=10):
def random_reservation():
    """ Repeated random hold-out evaluation (90% train / 10% test) of all classifiers,
    averaged over several runs; prints the classifiers sorted by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        bools = generate_random_bools(x_arrays_train, 90)
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_accuracy(
                bools, x_arrays_train, y_array_train, clf=clf_i)
            clf_index += 1
    average_probs: np.ndarray = np.array(count_prob) / times
    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)
    # sort the classifiers by average accuracy, best first
    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)

def k_fold():
    """ k-fold cross-validation (k=5 by default) of all classifiers, repeated with
    re-shuffled folds and averaged; prints the classifiers sorted by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        index_tuples = get_k_fold_test_indexes(get_random_indexes(x_arrays_train))
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_by_k_fold_times(
                index_tuples, clf=clf_i)
            clf_index += 1
    average_probs: np.ndarray = np.array(count_prob) / times
    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)
    # sort the classifiers by average accuracy, best first
    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)


if "__main__" == __name__:
    k_fold()
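
For reference, the same shuffled k-fold comparison can be reproduced with sklearn's built-in utilities. A minimal sketch, assuming the data-train.csv layout used above (feature columns followed by a 0/1 label column):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv('data-train.csv', encoding='utf-8')
X = np.array(df.iloc[:, :-1])   # feature columns
y = np.array(df.iloc[:, -1])    # 0/1 label column

cv = KFold(n_splits=5, shuffle=True)   # shuffled 5-fold split, like get_k_fold_test_indexes()
for clf in (GaussianNB(), KNeighborsClassifier(), RandomForestClassifier()):
    scores = cross_val_score(clf, X, y, cv=cv)
    print(type(clf).__name__, scores.mean())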

code (initial version)

# initial version: relies on output_file(), GaussianNB and KNeighborsClassifier from the script above
def exp6():
    from openpyxl import Workbook
    from openpyxl.utils.dataframe import dataframe_to_rows
    import pandas as pd
    import numpy as np
    prefix = "./exp6/"
    data_train = 'data-train.csv'
    data_test = 'data-test.csv'
    data_train = prefix + data_train
    data_test = prefix + data_test
    prediction_result = prefix + 'prediction.txt'
    # read the raw data as UTF-8 (convert the files to UTF-8 with a text editor first if needed)
    df_train = pd.read_csv(data_train, encoding='utf-8')
    x_arrays = np.array(df_train.iloc[:, :-1])
    y_array = np.array(df_train['target'])
    df_test = pd.read_csv(data_test, encoding='utf-8')
    newx = np.array(df_test.iloc[:, :-1])
    KNN_exp6(x_arrays, y_array, newx, prediction_result)
    # GNB_exp6(x_arrays, y_array, newx, prediction_result)
    # print("predict_GNB:", clf.predict(newx))
    # print("probability_GNB:", clf.predict_proba(newx))


def KNN_exp6(x, y, newx, prediction_result):
    # the default n_neighbors is 5
    clf = KNeighborsClassifier(n_neighbors=7)
    clf.fit(x, y)
    result_list = clf.predict(newx)
    output_file(result_list, prediction_result, "KNN")


def GNB_exp6(x, y, newx, prediction_result):
    clf = GaussianNB()
    clf.fit(x, y)
    result_list = clf.predict(newx)
    output_file(result_list, prediction_result, "GNB")


exp6()
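
The commented-out predict_proba lines above hint at reporting class probabilities as well. A minimal sketch that prints each predicted 0/1 label together with its per-class probabilities, assuming the same file layout and the 'target' column name used above:

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB

df_train = pd.read_csv('./exp6/data-train.csv', encoding='utf-8')
df_test = pd.read_csv('./exp6/data-test.csv', encoding='utf-8')
x = np.array(df_train.iloc[:, :-1])
y = np.array(df_train['target'])
newx = np.array(df_test.iloc[:, :-1])

clf = GaussianNB()
clf.fit(x, y)
labels = clf.predict(newx)        # hard 0/1 predictions
probs = clf.predict_proba(newx)   # shape (n_samples, 2): P(class 0), P(class 1)
for label, p in zip(labels, probs):
    print(label, p)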

Prediction: binary classification (labels 0/1)

data_train.csv:

(screenshot of data_train.csv)

data_test.csv:

(screenshot of data_test.csv)
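
Since the label is a 0/1 target, a quick sketch for checking the size and class balance of the training file (assuming the 'target' column name used in the code above):

import pandas as pd

df_train = pd.read_csv('data-train.csv', encoding='utf-8')
print(df_train.shape)                      # (number of rows, number of columns)
print(df_train['target'].value_counts())   # how many samples with label 0 vs label 1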
