机器学习实验代码

实验3-逻辑回归(对率)(注:下方代码实际实现的是用随机梯度下降最小化均方误差的二次多项式回归)

import numpy as np
import matplotlib.pyplot as plt

# Load the training data: the first row of data.csv is skipped and every
# value is parsed as an integer.
train = np.loadtxt('data.csv', delimiter=',', dtype='int', skiprows=1)
train_x = train[:,0]  # first column, used as the model input
train_y = train[:,1]  # second column, used as the regression target

# Standardization: statistics are taken from the training inputs
mu = train_x.mean()
sigma = train_x.std()
def standardize(x):
    """Return ``x`` shifted by the module-level ``mu`` and divided by ``sigma``."""
    return (x - mu) / sigma

train_z = standardize(train_x)

# Parameter initialization: three random coefficients, one per column of the
# matrix built by to_matrix
theta = np.random.rand(3)

# Build the design matrix for the training data
def to_matrix(x):
    """Return a matrix with one row per element of ``x``.

    The three columns are a constant 1, ``x`` itself and ``x ** 2``.
    """
    bias = np.ones(x.size)
    squared = x ** 2
    stacked_rows = np.vstack((bias, x, squared))
    return stacked_rows.T

X = to_matrix(train_z)

# Prediction function
def f(x):
    """Return the dot product of ``x`` with the current module-level ``theta``."""
    prediction = np.dot(x, theta)
    return prediction

# Mean squared error
def MSE(x, y):
    """Return the mean of the squared residuals ``y - f(x)`` over the rows of ``x``."""
    n_rows = x.shape[0]
    squared_residuals = (y - f(x)) ** 2
    return (1 / n_rows) * np.sum(squared_residuals)

# Learning rate
ETA = 1e-3

# Improvement of the error between two consecutive epochs
diff = 1

# Number of completed epochs
count = 0

# Repeat training until an epoch improves the MSE by no more than 1e-2.
# The loop also stops as soon as the error gets worse, because diff then
# becomes negative.
error = MSE(X, train_y)
log = '第 {} 次 : theta = {}, 差值 = {:.4f}'
while diff > 1e-2:
    # Stochastic gradient descent: visit the samples in a fresh random order
    # and update the parameters after every single sample
    p = np.random.permutation(X.shape[0])
    for x, y in zip(X[p,:], train_y[p]):
        theta = theta - ETA * (f(x) - y) * x

    # Difference to the error of the previous epoch
    current_error = MSE(X, train_y)
    diff = error - current_error
    error = current_error

    # Log the progress of this epoch
    count += 1
    print(log.format(count, theta, diff))
    
# Plot to check the fit: training points as markers plus the fitted curve
x = np.linspace(-3, 3, 100)  # 100 evenly spaced points on the standardized axis
plt.plot(train_z, train_y, 'o')
plt.plot(x, f(to_matrix(x)))
plt.show()

信息增益下实现决策树

from cProfile import label
from math import log
from re import A
import numpy as np
import operator
import csv

def loaddata():
    """Return the hard-coded sample table and the names of its features.

    Returns:
        A ``(dataSet, feature_name)`` pair. ``dataSet`` is a list of 17 rows,
        each made of six integer feature codes followed by a ``'yes'`` or
        ``'no'`` label; ``feature_name`` holds the six feature names.
    """
    # Feature codes of the samples labelled 'yes'
    positive_rows = (
        (0, 0, 0, 0, 0, 0),
        (1, 0, 1, 0, 0, 0),
        (1, 0, 0, 0, 0, 0),
        (0, 0, 1, 0, 0, 0),
        (2, 0, 0, 0, 0, 0),
        (0, 1, 0, 0, 1, 1),
        (1, 1, 0, 1, 1, 1),
        (1, 1, 0, 0, 1, 0),
    )
    # Feature codes of the samples labelled 'no'
    negative_rows = (
        (1, 1, 1, 1, 1, 0),
        (0, 2, 2, 0, 2, 1),
        (2, 2, 2, 2, 2, 0),
        (2, 0, 0, 2, 2, 1),
        (0, 1, 0, 1, 0, 0),
        (2, 1, 1, 1, 0, 0),
        (1, 1, 0, 0, 1, 1),
        (2, 0, 0, 2, 2, 0),
        (0, 0, 1, 1, 1, 0),
    )
    dataSet = [list(codes) + ['yes'] for codes in positive_rows]
    dataSet.extend(list(codes) + ['no'] for codes in negative_rows)
    feature_name = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']
    return dataSet, feature_name

def entropy(dataSet):
    """Return the base-2 entropy of the class labels in ``dataSet``.

    The label of a row is its last element. An empty ``dataSet`` gives 0.0.
    """
    total = len(dataSet)
    # Tally how many rows carry each label, e.g. {'yes': 8, 'no': 9}
    tally = {}
    for row in dataSet:
        tally[row[-1]] = tally.get(row[-1], 0) + 1

    ln2 = np.log(2)
    result = 0.0
    for occurrences in tally.values():
        share = occurrences / total
        # Base-2 logarithm through the change-of-base formula
        result -= share * (np.log(share) / ln2)
    return result

def splitDataSet(dataSet, axis, value):
    """Return the rows of ``dataSet`` whose entry at index ``axis`` equals ``value``.

    The rows are returned as they are (the ``axis`` column is kept) and in
    their original order.
    """
    return [row for row in dataSet if row[axis] == value]

def chooseBestFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    Every column except the last one is treated as a feature. The result is
    -1 when no feature has a strictly positive gain.
    """
    feature_count = len(dataSet[0]) - 1
    total = len(dataSet)
    # Entropy of the whole data set
    baseEntropy = entropy(dataSet)
    bestFeature, bestInfoGain = -1, 0.0
    for i in range(feature_count):
        # All values that feature i takes in the data set
        featList = [example[i] for example in dataSet]
        conditional = 0.0
        for value in set(featList):
            subset = splitDataSet(dataSet, i, value)
            # Every row of subset has `value` at index i, so its length is
            # the number of samples taking that value
            conditional += len(subset) / total * entropy(subset)
        gain = baseEntropy - conditional
        # Keep the first feature that reaches the largest gain seen so far
        if gain > bestInfoGain:
            bestInfoGain, bestFeature = gain, i
    return bestFeature

def classVote(classList):
    """Return the label that occurs most often in ``classList``.

    When several labels share the highest count, the one that appears first
    in ``classList`` is returned.
    """
    # Count the occurrences of every label
    tally = {}
    for label_value in classList:
        tally[label_value] = tally.get(label_value, 0) + 1
    # Order the (label, count) pairs by descending count
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _ = ranked[0]
    return winner

def trainTree(dataSet,feature_name):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: list of rows; the last element of every row is its label.
        feature_name: list of feature names, indexed like the feature columns.

    Returns:
        A class label for a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # All samples belong to the same class: leaf node
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column is left: majority vote
    if len(dataSet[0]) == 1:
        return classVote(classList)
    # Pick the feature to split on
    bestFeat = chooseBestFeature(dataSet)
    # chooseBestFeature returns -1 when no feature yields a positive
    # information gain. Indexing with -1 would split on the label column
    # under the name of the last feature, so stop with a majority vote.
    if bestFeat < 0:
        return classVote(classList)
    bestFeatName = feature_name[bestFeat]
    myTree = {bestFeatName:{}}
    featValues = [example[bestFeat] for example in dataSet]
    # One branch per value the chosen feature takes in this data set
    for value in set(featValues):
        # Rows of dataSet whose chosen feature equals value
        sub_dataset = splitDataSet(dataSet, bestFeat, value)
        # Each recursive call gets its own copy of the name list
        myTree[bestFeatName][value] = trainTree(sub_dataset, feature_name[:])
    return myTree

# Build the tree from the built-in sample data and print the nested-dict result
myDat,feature_name = loaddata()
myTree = trainTree(myDat,feature_name)
print(myTree)

实验五- BP神经网络

from tokenize import Double
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sympy import E1
# Seed both random number generators with the same fixed value
seed = 2020
import random
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.

plt.rcParams['font.sans-serif'] = ['SimHei'] # so that Chinese labels are rendered correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign is rendered correctly
plt.close('all')
# (1) Data preprocessing
def preprocess(data):
    """Encode and standardize the data set and split it into features and labels.

    Args:
        data: pandas DataFrame holding the feature columns and the label
            column '好瓜'. It is not modified.

    Returns:
        ``(x, y)``: ``x`` is the array of standardized feature columns and
        ``y`` is the encoded label column reshaped to ``(n_samples, 1)``.
    """
    # Work on a copy so that the caller's DataFrame keeps its original values
    data = data.copy()
    # Map every non-numeric column to integer codes. The dtype test also
    # covers pandas' dedicated string dtype, not only 'object'.
    for title in data.columns:
        if not pd.api.types.is_numeric_dtype(data[title]):
            encoder = LabelEncoder()
            data[title] = encoder.fit_transform(data[title])
    # Remove the mean and scale the feature columns to unit variance
    ss = StandardScaler()
    X = data.drop('好瓜',axis=1)
    Y = data['好瓜']
    X = ss.fit_transform(X)
    x,y = np.array(X),np.array(Y).reshape(Y.shape[0],1)
    return x,y
# Sigmoid activation
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# Derivative of the sigmoid, written in terms of the sigmoid's output
def d_sigmoid(x):
    """Return ``x * (1 - x)``.

    For ``x`` equal to a sigmoid output this is the derivative of the sigmoid
    at the corresponding input.
    """
    complement = 1 - x
    return x * complement
# (2) Standard BP algorithm
def standard_BP(x,y,dim=10,eta=0.8,max_iter=500):
    """Train a network with one hidden layer, updating after every sample.

    Args:
        x: 2-D array of inputs, one sample per row.
        y: 2-D array of targets with one row per sample.
        dim: number of hidden units.
        eta: learning rate.
        max_iter: number of passes over the data set.

    Returns:
        ``(w1, w2, b1, b2)``: input-to-hidden weights, hidden-to-output
        weights, hidden bias and output bias after training.

    Side effects:
        Prints the loss of every sample and shows a plot of the mean loss
        of each pass.
    """
    n_samples = 1  # the biases have a single row because samples are processed one at a time
    w1 = np.random.random((x.shape[1],dim))
    w2 = np.random.random((dim,1))
    b1 = np.random.random((n_samples,dim))
    b2 = np.random.random((n_samples,1))
    losslist = []
    for ite in range(max_iter):
        loss_per_ite = []
        for m in range(x.shape[0]):
            # Current sample and its label as 1-row matrices
            xi,yi = x[m,:],y[m,:]
            xi,yi = xi.reshape(1,xi.shape[0]),yi.reshape(1,yi.shape[0])
            xi = np.matrix(xi)
            yi = np.matrix(yi)
            # Forward pass
            out1 = np.dot(xi, w1) + b1
            bh = sigmoid(out1)  # hidden-layer output, reused by the backward pass
            out2 = np.dot(bh, w2) + b2
            out2 = sigmoid(out2)
            loss = np.square(yi - out2)/2
            loss = loss[0,0]
            loss_per_ite.append(loss)
            print("iter:%d  loss:%.4f"%(ite,loss))
            # Backward pass: g is the output-layer gradient term; the
            # operands are 1x1 matrices, so `*` is a plain product here
            g = out2 * (1 - out2) * (yi - out2)
            # e is the hidden-layer gradient term as a column, computed
            # with w2 before w2 is updated
            e = np.multiply(bh, (1 - bh))
            e = np.multiply(e, w2.T).T * g
            # Parameter updates
            w2 = w2 + eta * bh.T * g
            b2 = b2 + eta * g
            w1 = w1 + eta * np.dot(e, xi).T
            b1 = b1 + eta * e.T
        losslist.append(np.mean(loss_per_ite))
    # Loss visualization: one point per pass. The axis follows the number of
    # recorded losses so that any max_iter works, not only 500.
    plt.figure()
    epochs = np.arange(len(losslist))
    plt.plot(epochs, losslist)
    plt.show()
    return w1,w2,b1,b2
# (3) Test
data = pd.read_table('watermelon30.txt',delimiter=',')
data.drop('编号',axis=1,inplace=True)  # remove the '编号' column before preprocessing
x,y = preprocess(data)
# print(x)
# print(y)
dim = 10
w1,w2,b1,b2 = standard_BP(x,y,dim)
# Predict the class of every sample in x with the trained parameters
u1 = np.dot(x,w1)+b1
out1 = sigmoid(u1)
u2 = np.dot(out1,w2)+b2
out2 = sigmoid(u2)  
y_pred = np.round(out2)  # round the network output to 0 or 1
# Write the true labels next to the predictions to result.xlsx
result = pd.DataFrame(np.hstack((y,y_pred)),columns=['真值','预测'] )     
result.to_excel('result.xlsx',index=False)

posted @ 2022-03-15 18:16  伍六柒-  阅读(138)  评论(0)  编辑  收藏  举报