from typing import Tuple
import numpy as np
import pandas as pd
import random
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

""" Experiment 6: compare several sklearn classifiers using random hold-out splits and k-fold cross-validation. """


def output_file(result_list, result_file, method):
    ''' Write the prediction results to result_file, one result per line. '''
    with open(result_file, "w") as fos:
        # build the output in one pass; no trailing newline is written
        result = "\n".join(str(char) for char in result_list)
        fos.write(result)


data_train = 'data-train.csv'
data_test = 'data-test.csv'

clf_GNB = GaussianNB()
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()
clf_MLP = MLPClassifier(max_iter=10000)
clf_RF = RandomForestClassifier()
clf_GB = GradientBoostingClassifier()
df_train_df = pd.read_csv(data_train, encoding='utf-8')

# feature columns (all but the last) and label column (the last) of the training set
x_arrays_train = np.array(df_train_df.iloc[:, :-1])
y_array_train = np.array(df_train_df.iloc[:, -1])


def generate_random_bools(x_array: np.ndarray, estimate_scale: float = 90):
    """ Build a shuffled boolean mask over x_array in which roughly estimate_scale percent of
    the entries are True. Indexing with a mask, instead of slicing a single ndarray directly,
    lets the same random selection be applied to several aligned containers (e.g. X and y) so
    they stay paired; see the sketch after this function. """
    size = len(x_array)
    estimate_scale_int = int(size/100*estimate_scale)
    real_scale = size-estimate_scale_int
    true_list = [True for index in range(estimate_scale_int)]
    false_list = [False for index in range(real_scale)]
    bools = true_list+false_list
    random.shuffle(bools)
    return bools


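# A minimal sketch (defined here for illustration only and never called) of how one mask splits
# X and y consistently; the toy arrays below are assumptions, not data from the experiment.
def _demo_mask_split():
    xs = np.arange(10).reshape(5, 2)                      # 5 samples, 2 features
    ys = np.array([0, 1, 0, 1, 0])
    mask = generate_random_bools(xs, estimate_scale=80)   # ~80% of entries are True
    x_train, y_train = xs[mask], ys[mask]                 # rows kept for training
    x_test = xs[[not b for b in mask]]                    # the remaining hold-out rows
    return x_train.shape, y_train.shape, x_test.shape

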
def estimate_accuracy(bools: list[bool], x_array: np.ndarray, y_array: np.ndarray, clf=clf_GNB):
    """ Train clf on the rows where bools is True and return its accuracy on the remaining rows. """
    estimate_accuracy_x_train: np.ndarray = x_array[bools]
    estimate_accuracy_y_train: np.ndarray = y_array[bools]

    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)

    # the complement of the mask selects the hold-out rows
    bools_reverse = [not bool_ for bool_ in bools]
    estimate_accuracy_x_test = x_array[bools_reverse]
    real_result = y_array[bools_reverse]

    estimate_predict_result = clf.predict(estimate_accuracy_x_test)

    # count the predictions that match the true labels
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1

    accuracy = count/length
    return accuracy


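# The element-wise comparison used above can also be written with numpy directly; a minimal
# equivalent sketch (not used by the functions in this script):
def _accuracy_sketch(predicted, actual):
    return float(np.mean(np.asarray(predicted) == np.asarray(actual)))

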
def estimate_by_k_fold(index_tuple: Tuple, x_arrays_train: np.ndarray = x_arrays_train,
                       y_array_train: np.ndarray = y_array_train, clf=clf_GNB):
    """ index_tuple is (test indexes, train indexes); train clf on the training fold and
    return its accuracy on the test fold. """
    estimate_accuracy_x_train: np.ndarray = x_arrays_train[index_tuple[1]]
    estimate_accuracy_y_train: np.ndarray = y_array_train[index_tuple[1]]
    clf.fit(estimate_accuracy_x_train, estimate_accuracy_y_train)

    real_result = y_array_train[index_tuple[0]]
    estimate_accuracy_x_test = x_arrays_train[index_tuple[0]]
    estimate_predict_result = clf.predict(estimate_accuracy_x_test)

    # count the predictions that match the true labels
    length = len(estimate_predict_result)
    count = 0
    for label1, label2 in zip(estimate_predict_result, real_result):
        if label1 == label2:
            count += 1

    accuracy = count/length
    return accuracy


def estimate_by_k_fold_times(index_tuples, clf=clf_GNB):
    """ Run estimate_by_k_fold once per fold and return the mean accuracy. """
    count_probability = 0
    for index_tuple in index_tuples:
        result = estimate_by_k_fold(index_tuple, clf=clf)
        count_probability += result

    return count_probability/len(index_tuples)


| """ 产生原问题的训练数据集x_array_train元素的索引构成的随机化序列,这些索引值构成一个索引序列列表index_list |
| k等分索引序列,得到k各区间,分别收集索引列表各区间的起始索引start_index和终止索引end_index; |
| range(start_index,end_index)产生的序列可以取得的index_list中的连续的若干个元素,按照这些元素可以取得x_array_train中的对应的一系列元素 |
| 这些元素将作为测试集;同时,range(0,start_index)和range(end_index,) |
| 然而,更好的描述是:对于分组start_index,end_index,对于index_list的切片[start_index,end_index],[:start_index]和[end_index:] |
| 两个切片之和作为对应的训练集(中元素的索引) |
| |
| """ |
| |
| |
def get_random_indexes(x_arrays_train=x_arrays_train) -> list:
    """ Return a shuffled list of the row indexes of x_arrays_train. """
    size = len(x_arrays_train)
    indexes = list(range(size))
    random.shuffle(indexes)
    return indexes


def get_k_fold_test_indexes(random_indexes: list[int], k=5) -> list:
    """ Split random_indexes into k folds and return a list of
    (test indexes, train indexes) tuples, one per fold. """
    test_index_tuples: list = []
    size = len(random_indexes)
    array_size = size//k
    for i in range(k):
        array_start = i*array_size
        array_end = array_start+array_size
        # the current slice is the test fold; everything outside it is the training fold
        test_index_tuples.append(
            (random_indexes[array_start: array_end], random_indexes[:array_start]+random_indexes[array_end:]))
    return test_index_tuples


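# A minimal sketch (defined only for illustration and never called) of the index tuples produced
# above: with an assumed toy matrix of 10 samples and k=5, each tuple holds 2 test indexes and
# the remaining 8 training indexes.
def _demo_k_fold_indexes():
    toy_x = np.zeros((10, 3))                                  # assumed toy feature matrix
    folds = get_k_fold_test_indexes(get_random_indexes(toy_x), k=5)
    for test_idx, train_idx in folds:
        print("test:", test_idx, "train:", train_idx)

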
def random_reservation():
    """ Evaluate each classifier on 10 independent random 90/10 hold-out splits and
    print the classifiers ranked by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        bools = generate_random_bools(x_arrays_train, 90)
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_accuracy(
                bools, x_arrays_train, y_array_train, clf=clf_i)
            clf_index += 1

    average_probs: np.ndarray = np.array(count_prob)/times

    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)

    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)


def k_fold():
    """ Evaluate each classifier with 10 repetitions of 5-fold cross-validation and
    print the classifiers ranked by average accuracy. """
    classifiers: list = [clf_GNB, clf_KNN, clf_RF, clf_GB, clf_MLP]
    sort_list = []
    times = 10
    count_prob = [0.0 for i in range(len(classifiers))]
    for index in range(times):
        index_tuples = get_k_fold_test_indexes(get_random_indexes(x_arrays_train))
        clf_index = 0
        for clf_i in classifiers:
            count_prob[clf_index] += estimate_by_k_fold_times(
                index_tuples, clf=clf_i)
            clf_index += 1

    average_probs: np.ndarray = np.array(count_prob)/times

    for clf_perform in zip(classifiers, average_probs):
        print("in average result with:", clf_perform[0], clf_perform[1])
        sort_list.append(clf_perform)

    sort_list.sort(key=lambda tuple: tuple[1], reverse=True)
    for item in sort_list:
        print(item)


| if "__main__" == __name__: |
| k_fold() |
| |
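
# data_test and output_file are defined above but never used; a hedged sketch of how they might
# be wired together, assuming data-test.csv holds only feature columns in the same order as the
# training features (the file name "result.txt" is likewise an assumption for illustration):
def _demo_predict_test(clf=clf_GNB, result_file="result.txt"):
    clf.fit(x_arrays_train, y_array_train)               # train on the full training set
    df_test = pd.read_csv(data_test, encoding='utf-8')
    predictions = clf.predict(np.array(df_test))
    output_file(list(predictions), result_file, method="GaussianNB")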