2024/05/04(2024春季)
学习时长:3小时
代码行数:219
博客数量:1篇
今天我的高中同学委托了我一个任务,说是替他完成就请我吃东西。
这是他们学校的一个深度学习实验,要求用 Python 完成。
然后我就去网上搜索、问 AI,把要求的内容全都拼到了一起,我也不知道写了个什么东西。
# Iris classification walkthrough.
#
# Loads a local iris CSV, explores it with summary statistics and plots, then
# trains and evaluates three classifiers on a 70/30 train/test split:
# k-nearest neighbours, a decision tree, and logistic regression.

# --- Imports -----------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np  # kept from the original script; not referenced below
import pandas as pd
import seaborn as sns
from sklearn import datasets  # kept from the original script; not referenced below
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# --- Load the data set -------------------------------------------------------
data = "iris/data/iris.csv"
iris_local = pd.read_csv(data, usecols=[0, 1, 2, 3, 4])
iris_local = iris_local.dropna()  # drop every row that contains a missing value
# head() only displays automatically in a notebook; print it so a plain
# script run shows the preview too.
print(iris_local.head())
iris_local.info()

# --- Features and labels -----------------------------------------------------
feature_columns = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
X = iris_local[feature_columns]
Y = iris_local['variety']

# Encode the string class labels as integers.
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)
print(Y)

# --- Train/test split --------------------------------------------------------
# 30% of the rows are held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)
# NOTE(review): for the usual 150-row iris file these should be (105, 4) and
# (105,); confirm against the actual CSV.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

# --- Exploratory statistics and plots ----------------------------------------
# Mean, standard deviation, min, quartiles, median, etc.
print(iris_local.describe())

# Box plots.
iris_local.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)

# Histograms.
iris_local.hist()

# Kernel density estimates.
iris_local.plot(kind='kde')
iris_local.plot()

# Colour theme shared by the seaborn plots below.
antV = ['#1890FF', '#2FC25B', '#FACC14', '#223273',
        '#8543E0', '#13C2C2', '#3436c7', '#F04864']

# Violin plots: one panel per feature, grouped by variety.
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True)
sns.despine(left=True)
for feature, ax in zip(feature_columns, axes.flat):
    sns.violinplot(x='variety', y=feature, data=iris_local, palette=antV, ax=ax)
plt.show()

# Point plots: one panel per feature, grouped by variety.
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True)
sns.despine(left=True)
for feature, ax in zip(feature_columns, axes.flat):
    sns.pointplot(x='variety', y=feature, data=iris_local, color=antV[0], ax=ax)
plt.show()

# Matrix of pairwise relationships between the features.
g = sns.pairplot(data=iris_local, palette=antV, hue='variety')

# Andrews curves turn each multivariate observation into a curve whose
# Fourier-series coefficients are the feature values; useful for spotting
# outliers.
plt.subplots(figsize=(10, 8))
pd.plotting.andrews_curves(iris_local, 'variety', colormap='cool')
plt.show()

# Parallel coordinates.
pd.plotting.parallel_coordinates(iris_local, 'variety', colormap='brg')

# Linear-regression views, first on the sepal and then on the petal measurements.
g = sns.lmplot(data=iris_local, x='sepal.width', y='sepal.length',
               palette=antV, hue='variety')
g = sns.lmplot(data=iris_local, x='petal.width', y='petal.length',
               palette=antV, hue='variety')
plt.show()

# --- Model 1: k-nearest neighbours -------------------------------------------
# Pick K by training a model for each K in 1..30 and recording its mean
# accuracy under 10-fold cross-validation. The sweep runs once; the original
# repeated it verbatim.
# NOTE(review): the sweep uses all of X/Y, including the rows later used as
# the test set; consider cross-validating on X_train/y_train only.
k_range = range(1, 31)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, Y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]
print(k_scores)

# Plot accuracy against K to choose the best value.
plt.plot(k_range, k_scores)
plt.xlabel('The value of the k in KNN')
plt.ylabel('accuracy')
plt.show()

# The author observed the highest accuracy around K = 13, 18 and 20.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

# Evaluate the k-NN model on the held-out test set.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("准确率:", accuracy)
print('K近邻模型的准确率为:', accuracy)

# Weighted averaging: per-class scores weighted by each class's support.
precision = precision_score(y_test, y_pred, average='weighted')
print('K近邻模型的精确率为:', precision)

recall = recall_score(y_test, y_pred, average='weighted')
print('K近邻模型的召回率为:', recall)

f1 = f1_score(y_test, y_pred, average='weighted')
print('K近邻模型的F1值为:', f1)

conf_matrix = confusion_matrix(y_test, y_pred)
print('混淆矩阵:')
print(conf_matrix)

# --- Model 2: decision tree --------------------------------------------------
# Seeded so repeated runs give the same tree, matching the seeded split above.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("决策树分类器的准确率为:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)

# This is a 3-class problem, so precision/recall/F1 cannot be read off single
# cells of the confusion matrix as in the binary case; use the same
# weighted-average metrics as for k-NN.
precision = precision_score(y_test, y_pred, average='weighted')
print('决策树模型的精确度为:', precision)

recall = recall_score(y_test, y_pred, average='weighted')
print('决策树模型的召回率为:', recall)

F1 = f1_score(y_test, y_pred, average='weighted')
print('决策树模型的F1值为:', F1)
print('决策树的混淆矩阵', conf_matrix)

# Visualise the fitted tree. SimHei is set so the Chinese labels render;
# unicode_minus is disabled so minus signs display with that font.
mpl.rcParams['font.family'] = 'SimHei'
mpl.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(8, 8))
tree.plot_tree(clf, filled=True,
               feature_names=['花萼长', '花萼宽', '花瓣长', '花瓣宽'],
               class_names=['山鸢尾', '变色鸢尾', '维吉尼亚鸢尾'])
plt.show()

# --- Model 3: logistic regression --------------------------------------------
# Note: this rebinds `clf`, replacing the decision tree above.
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)

# Predict on both the training and the test set with the fitted model.
train_predict = clf.predict(X_train)
test_predict = clf.predict(X_test)

# Accuracy = correctly predicted samples / all predicted samples.
# The labels say which split each number belongs to.
print('逻辑回归在训练集上的准确性为:', accuracy_score(y_train, train_predict))
print('逻辑回归在测试集上的准确性为:', accuracy_score(y_test, test_predict))

# Confusion matrix (true labels first, predictions second, as for the other
# models, so rows are true classes and columns are predicted classes).
confusion_matrix_result = confusion_matrix(y_test, test_predict)
print('The confusion matrix result:\n', confusion_matrix_result)