机器学习实验代码
实验3-逻辑回归(对率)
import numpy as np
import matplotlib.pyplot as plt

# Load the training data: skip the header row; column 0 is the input and
# column 1 is the target.
train = np.loadtxt('data.csv', delimiter=',', dtype='int', skiprows=1)
train_x, train_y = train[:, 0], train[:, 1]

# Standardization statistics, computed once from the training inputs.
mu = train_x.mean()
sigma = train_x.std()


def standardize(x):
    """Return x shifted by the training mean and divided by the training std."""
    return (x - mu) / sigma


train_z = standardize(train_x)

# Random initial parameters, one per column of the design matrix.
theta = np.random.rand(3)


def to_matrix(x):
    """Build the design matrix whose columns are 1, x and x squared."""
    columns = [np.ones(x.size), x, x ** 2]
    return np.vstack(columns).T


X = to_matrix(train_z)


def f(x):
    """Predict with the current module-level theta (dot product of x and theta)."""
    return np.dot(x, theta)


def MSE(x, y):
    """Return the mean squared error between f(x) and y."""
    squared_residuals = (y - f(x)) ** 2
    return (1 / x.shape[0]) * np.sum(squared_residuals)


# Learning rate.
ETA = 1e-3

# Error change between consecutive epochs; starts above the threshold so the
# training loop runs at least once.
diff = 1

# Number of completed epochs.
count = 0

# Keep training while an epoch lowers the error by more than 1e-2.
error = MSE(X, train_y)
while diff > 1e-2:
    # Stochastic gradient descent: visit the samples in a fresh random order
    # and update theta after every single sample.
    p = np.random.permutation(X.shape[0])
    print(p)
    for x, y in zip(X[p, :], train_y[p]):
        residual = f(x) - y
        theta = theta - ETA * residual * x

    # Difference between the previous epoch's error and the current one.
    current_error = MSE(X, train_y)
    diff = error - current_error
    error = current_error

    # Progress log.
    count += 1
    log = '第 {} 次 : theta = {}, 差值 = {:.4f}'
    print(log.format(count, theta, diff))

# Plot the standardized training points and the fitted curve.
x = np.linspace(-3, 3, 100)
plt.plot(train_z, train_y, 'o')
plt.plot(x, f(to_matrix(x)))
plt.show()
基于信息增益实现决策树
from cProfile import label
from collections import Counter
from math import log
from re import A
import numpy as np
import operator
import csv


def loaddata():
    """Return the built-in 17-sample data set and its feature names.

    Each row holds six integer-coded feature values followed by the class
    label ('yes' or 'no').
    """
    dataSet = [[0, 0, 0, 0, 0, 0, 'yes'],
               [1, 0, 1, 0, 0, 0, 'yes'],
               [1, 0, 0, 0, 0, 0, 'yes'],
               [0, 0, 1, 0, 0, 0, 'yes'],
               [2, 0, 0, 0, 0, 0, 'yes'],
               [0, 1, 0, 0, 1, 1, 'yes'],
               [1, 1, 0, 1, 1, 1, 'yes'],
               [1, 1, 0, 0, 1, 0, 'yes'],
               [1, 1, 1, 1, 1, 0, 'no'],
               [0, 2, 2, 0, 2, 1, 'no'],
               [2, 2, 2, 2, 2, 0, 'no'],
               [2, 0, 0, 2, 2, 1, 'no'],
               [0, 1, 0, 1, 0, 0, 'no'],
               [2, 1, 1, 1, 0, 0, 'no'],
               [1, 1, 0, 0, 1, 1, 'no'],
               [2, 0, 0, 2, 2, 0, 'no'],
               [0, 0, 1, 1, 1, 0, 'no']]
    feature_name = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']
    return dataSet, feature_name


def entropy(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The label is the last element of every row. An empty data set yields 0.0.
    """
    m = len(dataSet)
    # Number of samples per class label, e.g. {'yes': 8, 'no': 9}.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    e = 0.0
    for label_count in labelCounts.values():
        t = label_count / m
        # log base 2 via change of base; computed inline so the imported
        # math.log is not shadowed.
        e -= t * (np.log(t) / np.log(2))
    return e


def splitDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose feature at index axis equals value.

    Rows are returned unchanged (the feature column is not removed).
    """
    return [row for row in dataSet if row[axis] == value]


def chooseBestFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    Returns -1 when dataSet is empty or when no feature has a strictly
    positive gain. Ties are resolved in favour of the lowest index.
    """
    if not dataSet:
        return -1
    n = len(dataSet[0]) - 1  # the last column is the label
    total = len(dataSet)
    baseEntropy = entropy(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(n):
        uniqueVals = set(example[i] for example in dataSet)
        # Conditional entropy: size-weighted entropy of every partition.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            newEntropy += len(subDataSet) / total * entropy(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def classVote(classList):
    """Return the most frequent label; ties go to the label seen first."""
    return Counter(classList).most_common(1)[0][0]


def trainTree(dataSet, feature_name):
    """Recursively build a decision tree using information gain.

    Returns either a class label (leaf) or a nested dict of the form
    {feature_name: {feature_value: subtree_or_label}}.
    dataSet must be non-empty; an empty list raises IndexError.
    """
    classList = [example[-1] for example in dataSet]
    # All samples share one class: leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: majority vote.
    if len(dataSet[0]) == 1:
        return classVote(classList)
    bestFeat = chooseBestFeature(dataSet)
    # No feature separates the samples any further (e.g. identical feature
    # vectors with conflicting labels). Without this guard, index -1 would
    # select the label column and the tree would "split" on the class itself.
    if bestFeat == -1:
        return classVote(classList)
    bestFeatName = feature_name[bestFeat]
    myTree = {bestFeatName: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    # One branch per value that the chosen feature takes in this subset.
    for value in uniqueVals:
        sub_dataset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatName][value] = trainTree(sub_dataset, feature_name[:])
    return myTree


myDat, feature_name = loaddata()
myTree = trainTree(myDat, feature_name)
print(myTree)
实验五-BP神经网络
from tokenize import Double
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sympy import E1
seed = 2020
import random
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
plt.close('all')


# (1) Data preprocessing
def preprocess(data):
    """Encode, standardize and split a DataFrame into features and labels.

    Every column of dtype object is label-encoded in place (the caller's
    DataFrame is modified). The '好瓜' column is the target; all remaining
    columns are standardized with StandardScaler.

    Returns:
        (x, y): x is the standardized feature array, y is the target
        reshaped to a column of shape (n_samples, 1).
    """
    # Map non-numeric columns to integer codes.
    for title in data.columns:
        if data[title].dtype == 'object':
            encoder = LabelEncoder()
            data[title] = encoder.fit_transform(data[title])
    # Zero-mean / unit-variance scaling of the features.
    ss = StandardScaler()
    X = data.drop('好瓜', axis=1)
    Y = data['好瓜']
    X = ss.fit_transform(X)
    x, y = np.array(X), np.array(Y).reshape(Y.shape[0], 1)
    return x, y


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise."""
    return 1 / (1 + np.exp(-x))


def d_sigmoid(x):
    """Return x * (1 - x): the sigmoid derivative expressed via its output x."""
    return x * (1 - x)


# (2) Standard (per-sample) BP algorithm
def standard_BP(x, y, dim=10, eta=0.8, max_iter=500):
    """Train a one-hidden-layer sigmoid network, updating after every sample.

    Args:
        x: 2-D feature array of shape (n_samples, n_features).
        y: 2-D target array of shape (n_samples, 1).
        dim: number of hidden units.
        eta: learning rate.
        max_iter: number of passes over the data.

    Prints the squared-error loss of every sample and shows a plot of the
    mean loss per pass.

    Returns:
        (w1, w2, b1, b2) as plain 2-D ndarrays with shapes
        (n_features, dim), (dim, 1), (1, dim) and (1, 1).
    """
    n_samples = 1
    # Draw order (w1, w2, b1, b2) is kept so a fixed seed gives the same
    # initial parameters.
    w1 = np.random.random((x.shape[1], dim))
    w2 = np.random.random((dim, 1))
    b1 = np.random.random((n_samples, dim))
    b2 = np.random.random((n_samples, 1))
    losslist = []
    for ite in range(max_iter):
        loss_per_ite = []
        for m in range(x.shape[0]):
            # Slicing with m:m + 1 keeps both operands 2-D: (1, n_features)
            # and (1, 1), so no np.matrix is needed.
            xi = x[m:m + 1, :]
            yi = y[m:m + 1, :]

            # Forward pass.
            bh = sigmoid(np.dot(xi, w1) + b1)      # hidden output, (1, dim)
            out2 = sigmoid(np.dot(bh, w2) + b2)    # network output, (1, 1)
            loss = (np.square(yi - out2) / 2)[0, 0]
            loss_per_ite.append(loss)
            print("iter:%d loss:%.4f" % (ite, loss))

            # Backward pass: output-layer and hidden-layer gradient terms.
            # e must be computed with w2 from before its update below.
            g = out2 * (1 - out2) * (yi - out2)    # (1, 1)
            e = bh * (1 - bh) * w2.T * g           # (1, dim)

            # Parameter updates.
            w2 = w2 + eta * bh.T * g
            b2 = b2 + eta * g
            w1 = w1 + eta * np.dot(xi.T, e)
            b1 = b1 + eta * e
        losslist.append(np.mean(loss_per_ite))

    # Loss visualization: one point per completed pass, so the axis always
    # matches losslist regardless of max_iter.
    plt.figure()
    epochs = np.arange(len(losslist))
    plt.plot(epochs, losslist)
    plt.show()
    return w1, w2, b1, b2


# (3) Test
data = pd.read_table('watermelon30.txt', delimiter=',')
data.drop('编号', axis=1, inplace=True)
x, y = preprocess(data)
dim = 10
w1, w2, b1, b2 = standard_BP(x, y, dim)
# Predict the class of every training sample with the learned parameters.
u1 = np.dot(x, w1) + b1
out1 = sigmoid(u1)
u2 = np.dot(out1, w2) + b2
out2 = sigmoid(u2)
y_pred = np.round(out2)
result = pd.DataFrame(np.hstack((y, y_pred)), columns=['真值', '预测'])
result.to_excel('result.xlsx', index=False)
本文作者:伍六柒-
本文链接:https://www.cnblogs.com/paper-plane/p/16009694.html
版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步