04（2024春季）

学习时长：3小时
代码行数：219
博客数量：1篇
今天我的高中同学委托给我一个任务，说是替他完成就请我吃东西。
这是他们学校的一个深度学习实验，要求使用 Python。
然后我就去网上搜索、问 AI，把要求的内容全都拼到了一起，我也不知道自己写了个什么东西。
# 导入相关包
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris CSV from disk, keeping only its first five columns.
data = "iris/data/iris.csv"
# dropna() with default arguments removes every ROW that contains a missing
# value (columns are never dropped).
iris_local = pd.read_csv(data, usecols=list(range(5))).dropna()
# The return value of head() is discarded, so outside an interactive session
# this line displays nothing.
iris_local.head()
# Print a column/dtype/non-null summary of the cleaned frame.
iris_local.info()

from sklearn.preprocessing import LabelEncoder

# Feature matrix: the four measurement columns.
X = iris_local.loc[:, ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]

# Label vector: the 'variety' column, encoded as integer class ids.
encoder = LabelEncoder()
Y = encoder.fit_transform(iris_local['variety'])
print(Y)

from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)
# Report the size of the training part.
# NOTE(review): assuming the usual 150-row iris file, a 30% hold-out leaves
# 105 training rows (a 112-row training set would correspond to a 25%
# hold-out) — confirm against the printed output.
for label, part in (("X_train", X_train), ("y_train", y_train)):
    print(f"{label} shape: {part.shape}")



# Summary statistics (count, mean, std, min, quartiles, max).
print(iris_local.describe())

# Box plots: one subplot per column on a 2x2 grid, with independent axes.
iris_local.plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)

# Histograms of the columns.
iris_local.hist()

# Kernel density estimates.
iris_local.plot.kde()

# Plain line plot of the values against the row index.
iris_local.plot.line()



# Violin plots: distribution of each measurement per variety, on a 2x2 grid.
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True)
sns.despine(left=True)
# Colour theme shared by the plots.
antV = ['#1890FF', '#2FC25B', '#FACC14', '#223273', '#8543E0', '#13C2C2', '#3436c7', '#F04864']
# axes.flat walks the grid row by row: (0, 0), (0, 1), (1, 0), (1, 1).
violin_features = ('sepal.length', 'sepal.width', 'petal.length', 'petal.width')
for feature, ax in zip(violin_features, axes.flat):
    sns.violinplot(x='variety', y=feature, data=iris_local, palette=antV, ax=ax)
plt.show()

# Point plots: one panel per measurement, grouped by variety, on a 2x2 grid.
f, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True)
sns.despine(left=True)
antV = ['#1890FF', '#2FC25B', '#FACC14', '#223273', '#8543E0', '#13C2C2', '#3436c7', '#F04864']
point_features = ('sepal.length', 'sepal.width', 'petal.length', 'petal.width')
for idx, feature in enumerate(point_features):
    # divmod(idx, 2) maps 0..3 to grid positions (0, 0), (0, 1), (1, 0), (1, 1).
    sns.pointplot(x='variety', y=feature, data=iris_local, color=antV[0], ax=axes[divmod(idx, 2)])
plt.show()
# Pairwise scatter-plot matrix of the features, coloured by variety.
g = sns.pairplot(iris_local, hue='variety', palette=antV)

# Andrews curves: each multivariate observation is drawn as one curve whose
# Fourier-series coefficients are the observation's feature values.
plt.subplots(figsize=(10, 8))
pd.plotting.andrews_curves(iris_local, class_column='variety', colormap='cool')
plt.show()

# Parallel-coordinates plot, one line per observation, coloured by variety.
pd.plotting.parallel_coordinates(iris_local, class_column='variety', colormap='brg')

# Linear-regression plots per variety: first for the sepal measurements
# (width vs. length), then for the petal measurements.
for x_col, y_col in (('sepal.width', 'sepal.length'), ('petal.width', 'petal.length')):
    g = sns.lmplot(data=iris_local, x=x_col, y=y_col, palette=antV, hue='variety')
plt.show()


# Build the model: k-nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Choose k: for every k in 1..30, run 10-fold cross-validation (the data is
# split into 10 parts and each part serves once as the validation fold) and
# record the mean accuracy. The sweep is executed once and its result is
# reused for both the printout and the plot.
# NOTE(review): the cross-validation runs on the full X/Y, which includes the
# rows later used as the test set, so the test metrics of the chosen k may be
# optimistic — consider tuning on X_train/y_train only.
k_range = range(1, 31)
k_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, Y, cv=10, scoring='accuracy'
    ).mean()
    for k in k_range
]
print(k_scores)

# Plot accuracy against k to pick the best-performing value.
plt.plot(k_range, k_scores)
plt.xlabel('The value of the k in KNN')
plt.ylabel('accuracy')
plt.show()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Author's observation: the accuracy curve peaked at roughly k = 13, 18 and 20;
# k = 20 is used for the final model.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

# Evaluate the fitted model on the held-out test set.
y_pred = knn.predict(X_test)

# Accuracy is computed once and reported under both labels.
accuracy = accuracy_score(y_test, y_pred)
print("准确率：", accuracy)
print('K近邻模型的准确率为:', accuracy)

# Precision, recall and F1 all use support-weighted averaging over the classes.
weighted = dict(average='weighted')

precision = precision_score(y_test, y_pred, **weighted)
print('K近邻模型的精确率为:', precision)

recall = recall_score(y_test, y_pred, **weighted)
print('K近邻模型的召回率为:', recall)

f1 = f1_score(y_test, y_pred, **weighted)
print('K近邻模型的F1值为:', f1)

# Confusion matrix, called as (true labels, predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred)
print('混淆矩阵:')
print(conf_matrix)







# Decision-tree model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Create the classifier; random_state is fixed so repeated runs build the same
# tree, matching the fixed seeds used for the split and the other models.
clf = DecisionTreeClassifier(random_state=0)
# Fit on the training split.
clf.fit(X_train, y_train)
# Predict the held-out test split.
y_pred = clf.predict(X_test)
# Accuracy of the classifier.
accuracy = accuracy_score(y_test, y_pred)
print("决策树分类器的准确率为：", accuracy)

# Confusion matrix, called as (true labels, predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision / recall / F1 for the multi-class problem. Reading single cells
# of the confusion matrix (as a binary TP/FP/FN) would ignore every entry
# involving the third class and could divide by zero, so the library metrics
# with support-weighted averaging are used, as in the KNN evaluation.
precision = precision_score(y_test, y_pred, average='weighted')
print('决策树模型的精确度为:', precision)
recall = recall_score(y_test, y_pred, average='weighted')
print('决策树模型的召回率为:', recall)
F1 = f1_score(y_test, y_pred, average='weighted')
print('决策树模型的F1值为:', F1)
print('决策树的混淆矩阵',conf_matrix)
# Visualise the fitted decision tree (its node structure, not a decision
# boundary).
import matplotlib as mpl
from matplotlib import pyplot as plt
from sklearn import tree

# Font specification; not referenced again in this section.
font2 = dict(family='SimHei', weight='normal', size=20)
# Use the SimHei font so the Chinese labels render, and keep the minus sign
# drawable with that font.
mpl.rcParams.update({'font.family': 'SimHei', 'axes.unicode_minus': False})

fig = plt.figure(figsize=(8, 8))
# Display names for the four features (sepal length/width, petal length/width).
feature_labels = ['花萼长', '花萼宽', '花瓣长', '花瓣宽']
# Display names for the three classes.
# NOTE(review): presumably listed in the same order as the encoded labels —
# verify against encoder.classes_.
class_labels = ['山鸢尾', '变色鸢尾', '维吉尼亚鸢尾']
tree.plot_tree(clf, filled=True, feature_names=feature_labels, class_names=class_labels)
plt.show()



# Logistic-regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Define the model and fit it on the training split.
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)

# Predict both splits with the fitted model.
train_predict = clf.predict(X_train)
test_predict = clf.predict(X_test)

# Accuracy = correctly predicted samples / all predicted samples. The two
# lines are labelled separately so training and test accuracy can be told
# apart in the output.
print('逻辑回归在训练集上的准确性为:', accuracy_score(y_train, train_predict))
print('逻辑回归在测试集上的准确性为:', accuracy_score(y_test, test_predict))

# Confusion matrix, called as (true labels, predicted labels) so that rows are
# the true classes and columns the predictions, consistent with the KNN and
# decision-tree sections.
confusion_matrix_result = confusion_matrix(y_test, test_predict)
print('The confusion matrix result:\n', confusion_matrix_result)
posted @ 2024-05-04 21:54 伐木工熊大 阅读(4) 评论(0) 编辑 收藏 举报
刷新页面返回顶部
zhenaifen

2024/05/04（2024春季）

公告