
Get the training data from:
https://github.com/xuchaoxin1375/learnPython
python code:
| from typing import Iterable |
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.naive_bayes import GaussianNB |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.ensemble import GradientBoostingClassifier |
| import numpy as np |
| import random |
| ''' 本程序采用python3的注解,标记出变量/函数的类型,提高可读性 ''' |
| |
| |
def get_percents(protein: str) -> list[float]:
    """Return the relative frequency of each of the 20 standard amino
    acids in *protein* — a normalized 20-element feature vector used as
    the classifier input.

    An empty sequence returns a vector of zeros instead of raising
    ZeroDivisionError (the original divided by len(protein) unguarded).
    """
    aa20: tuple = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
                   'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')
    protein_len: int = len(protein)
    if protein_len == 0:
        # Guard the empty-sequence edge case.
        return [0.0] * len(aa20)
    # str.count runs at C speed; one pass per amino-acid letter.
    return [protein.count(amino) / protein_len for amino in aa20]
| |
| |
def get_protein_sequences1(file: str) -> list[str]:
    """Read protein sequences (third whitespace-separated field, index 2)
    from the training file.

    Uses ``str.split()`` with no argument: runs of spaces/tabs collapse,
    so the field index cannot shift.  The original ``split(" ")`` yielded
    empty fields on consecutive spaces and mis-indexed the line.
    Blank or too-short lines are skipped instead of raising IndexError.
    """
    sequences: list[str] = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 2:  # skip blank / malformed lines
                sequences.append(fields[2])
    return sequences
| |
| |
def get_protein_sequences2(file: str) -> list[str]:
    """Read protein sequences (second whitespace-separated field, index 1)
    from the test file.

    ``str.split()`` with no argument is used so consecutive spaces/tabs
    cannot shift the field index (the original ``split(" ")`` could).
    Blank or too-short lines are skipped instead of raising IndexError.
    """
    sequences: list[str] = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 1:  # skip blank / malformed lines
                sequences.append(fields[1])
    return sequences
| |
| |
def get_protein_labels(file: str) -> list[int]:
    """Read the integer class label (second whitespace-separated field,
    index 1) of each protein from the training file.

    ``str.split()`` with no argument is used so consecutive spaces/tabs
    cannot shift the field index; blank or too-short lines are skipped
    instead of raising IndexError/ValueError.
    """
    labels: list[int] = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 1:  # skip blank / malformed lines
                labels.append(int(fields[1]))
    return labels
| |
| |
def output_file(result_iterable: Iterable, result_file: str, classifier=""):
    """Write each item of *result_iterable* to *result_file*, one per line.

    *classifier* is only echoed to stdout for traceability.

    Fixes vs. the original: the output string is built once with
    ``str.join`` (the old ``result += ...`` loop was quadratic), it is
    printed a single time instead of once per iteration, and the no-op
    ``result.strip()`` (return value discarded) is removed.
    """
    print(classifier)
    lines = [str(item) + '\n' for item in result_iterable]
    result = "".join(lines)
    print(result)
    with open(result_file, "w") as fos:
        fos.write(result)
| |
| |
# NOTE(review): these statements run at import time and read hard-coded
# local paths — the module cannot be imported without the data files.
prefix = "D:/OneDrive - pop.zjgsu.edu.cn/PythonPath/exp7/"
ProSeqs_Test = prefix+"ProSeqs_Test.txt"
ProSeqs_Train = prefix+"ProSeqs_Train.txt"
# Raw training sequences and their integer labels.
x_list: list[str] = get_protein_sequences1(ProSeqs_Train)
y_list: list[int] = get_protein_labels(ProSeqs_Train)
# 20-dim amino-acid composition feature vector per sequence.
x_percents = [get_percents(protein) for protein in x_list]
x_list_test = get_protein_sequences2(ProSeqs_Test)
x_percents_test = [get_percents(protein) for protein in x_list_test]
""" get the numerical data set and corresponding labels: """
x_array: np.ndarray = np.array(
    x_percents)
y_array: np.ndarray = np.array(y_list)
x_array_test: np.ndarray = np.array(x_percents_test)
| |
| |
| |
# Candidate classifiers, all with sklearn default hyper-parameters.
clf_GNB = GaussianNB()
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()
clf_MLP = MLPClassifier()  # instantiated but absent from the __main__ comparison list
clf_RF = RandomForestClassifier()
clf_GB = GradientBoostingClassifier()
| |
| |
def estimate_accuracy(x_array: np.ndarray, y_array: np.ndarray, estimate_scale: float, clf):
    """Hold-out accuracy estimate for *clf*.

    A shuffled boolean mask selects ``estimate_scale`` percent of the
    rows for training; the classifier is fitted on that subset and
    scored on the remaining rows.  One shared mask keeps the x and y
    rows aligned while still giving a random split.

    Returns the fraction of correct predictions (0.0 when the test
    split is empty — the original divided by zero at estimate_scale=100).
    """
    size = len(x_array)
    train_size = int(size / 100 * estimate_scale)
    # One True per training row, shuffled into random positions.
    flags = [True] * train_size + [False] * (size - train_size)
    random.shuffle(flags)
    train_mask = np.array(flags)
    test_mask = ~train_mask

    if not test_mask.any():
        # Nothing left to score against; avoid ZeroDivisionError.
        return 0.0

    clf.fit(x_array[train_mask], y_array[train_mask])
    predictions = clf.predict(x_array[test_mask])
    # Vectorized accuracy replaces the original manual counting loop.
    return float(np.mean(predictions == y_array[test_mask]))
| |
| |
def get_average_accuracy(clf=clf_GNB, times: int = 10, estimate_scale=95):
    """Average ``estimate_accuracy`` over *times* independent random splits.

    Relies on the module-level ``x_array`` / ``y_array`` training data.
    The original manual while-countdown is replaced by ``sum`` over a
    generator, which cannot loop forever on a negative *times*.
    """
    total = sum(
        estimate_accuracy(x_array, y_array, estimate_scale, clf)
        for _ in range(times)
    )
    return total / times
| |
| """ 将结果写入文件: """ |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
if __name__ == "__main__":
    # Compare the candidate classifiers by averaged hold-out accuracy
    # (clf_MLP is instantiated above but not included in this run).
    classifiers: list = [clf_GNB, clf_KNN, clf_LR, clf_RF, clf_GB]
    sort_list = []
    for clf in classifiers:
        result = get_average_accuracy(times=10, estimate_scale=98, clf=clf)
        print("in average result with:", clf, result)
        sort_list.append((result, clf))

    # Best first; the sort key no longer shadows the builtin `tuple`.
    sort_list.sort(key=lambda pair: pair[0], reverse=True)
    for item in sort_list:
        print(item)
| |
| |
python code (initial version):
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.naive_bayes import GaussianNB |
| import numpy as np |
| |
| |
def get_percents(protein):
    ''' Return the fraction of each of the 20 standard amino acids in
    protein (a normalized 20-element feature vector).  An empty sequence
    returns zeros instead of raising ZeroDivisionError. '''
    aa20 = ('A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
            'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V')
    protein_len = len(protein)
    if protein_len == 0:
        # Guard the empty-sequence edge case.
        return [0.0] * len(aa20)
    # str.count runs at C speed; one pass per amino-acid letter.
    return [protein.count(amino) / protein_len for amino in aa20]
| |
| |
def get_protein_sequences1(file):
    ''' Read protein sequences (third whitespace-separated field) from
    the training file.  split() with no argument is used so consecutive
    spaces cannot shift the field index; short/blank lines are skipped
    instead of raising IndexError. '''
    sequences = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 2:  # skip blank / malformed lines
                sequences.append(fields[2])
    return sequences
| |
def get_protein_sequences2(file):
    ''' Read protein sequences (second whitespace-separated field) from
    the test file.  split() with no argument is used so consecutive
    spaces cannot shift the field index; short/blank lines are skipped
    instead of raising IndexError. '''
    sequences = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 1:  # skip blank / malformed lines
                sequences.append(fields[1])
    return sequences
| |
def get_protein_labels(file):
    ''' Read the integer class label (second whitespace-separated field)
    of each protein.  split() with no argument is used so consecutive
    spaces cannot shift the field index; short/blank lines are skipped
    instead of raising IndexError/ValueError. '''
    labels = []
    with open(file, "r") as file_input_stream:
        for line in file_input_stream:
            fields = line.split()
            if len(fields) > 1:  # skip blank / malformed lines
                labels.append(int(fields[1]))
    return labels
| |
| |
def output_file(result_list, result_file, classifier=""):
    ''' Write each item of result_list to result_file, one per line.
    (classifier is accepted for interface compatibility but unused here.)

    Fixes vs. the original: the output string is built once with
    str.join (the old += loop was quadratic), it is printed a single
    time instead of once per iteration, and the no-op result.strip()
    (return value discarded) is removed. '''
    lines = [str(item) + '\n' for item in result_list]
    result = "".join(lines)
    print(result)
    with open(result_file, "w") as fos:
        fos.write(result)
| |
| |
# NOTE(review): runs at import time and reads hard-coded local paths —
# the script cannot start without the data files present.
prefix = "D:/OneDrive - pop.zjgsu.edu.cn/PythonPath/exp7/"
ProSeqs_Test = prefix+"ProSeqs_Test.txt"
ProSeqs_Train = prefix+"ProSeqs_Train.txt"
# Raw training sequences and their integer labels.
x_list = get_protein_sequences1(ProSeqs_Train)
y_list = get_protein_labels(ProSeqs_Train)
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# 20-dim amino-acid composition feature vector per training sequence.
x_percents = [get_percents(protein) for protein in x_list]

# Numerical training matrix and label vector for sklearn.
x_array = np.array(x_percents)
y_array = np.array(y_list)

# Same featurization for the unlabeled test file.
x_list_test = get_protein_sequences2(ProSeqs_Test)
x_percents_test = [get_percents(protein) for protein in x_list_test]
x_array_test = np.array(x_percents_test)

# Shared Gaussian naive-Bayes classifier used by the rest of the script.
clf = GaussianNB()
| |
| |
def estimate_accuracy(x_array, y_array, sample_num=1500):
    ''' Fit the module-level clf on the first sample_num rows and report
    its accuracy on the remaining rows.

    Fixes vs. the original: the sample_num argument is no longer
    overwritten by a hard-coded 1500; the local variable no longer
    shadows the builtin len() (which made len(estimate_result) raise
    UnboundLocalError); an empty test split no longer divides by zero;
    and the accuracy is returned as well as printed. '''
    estimate_accuracy_x = x_array[:sample_num]
    estimate_accuracy_y = y_array[:sample_num]
    clf.fit(estimate_accuracy_x, estimate_accuracy_y)

    estimate_accuracy_x_test = x_array[sample_num:]
    real_result = y_array[sample_num:]
    estimate_result = clf.predict(estimate_accuracy_x_test)

    ''' the GNB will be expecting has the 80% accuracy or so: '''
    length = len(estimate_result)
    count = sum(1 for label1, label2 in zip(estimate_result, real_result)
                if label1 == label2)
    accuracy = count / length if length else 0.0
    print(accuracy, length, "elements were predicted")
    return accuracy
| |
| |
| |
# Final model: fit on the full training set, predict the test set, and
# dump the predicted labels (one per line) next to the input files.
clf.fit(x_array, y_array)
result_list = clf.predict(x_array_test)
print(result_list)
prediction_result = prefix+"preds.txt"
output_file(result_list, prediction_result)
| |
result:

【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
2023-08-24 LA@非齐次线性方程组解的结构
2023-08-24 LA@齐次线性方程组解的结构
2023-08-24 LA@0线性方程组的解摘要@记号说明
2023-08-24 LA@向量组间的表示关系