Problem description:

Implement logistic regression and report its results on watermelon dataset 3.0\(\alpha\).

Implementation

Logistic (log-odds) regression minimizes the following loss function (Eq. 3.27 in the watermelon book):

\[l(\beta) = \sum_{i=1}^m \left(-y_i\beta^T x_i + \ln\bigl(1+e^{\beta^T x_i}\bigr)\right) \]
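For reference, Eq. 3.27 is just the negative log-likelihood. With \(p_1(x_i;\beta) = \frac{e^{\beta^T x_i}}{1+e^{\beta^T x_i}}\) denoting the predicted probability of the positive class and labels \(y_i \in \{0,1\}\),

\[\ln p(y_i \mid x_i;\beta) = y_i\,\beta^T x_i - \ln\bigl(1+e^{\beta^T x_i}\bigr), \]

so maximizing the log-likelihood \(\sum_{i=1}^m \ln p(y_i \mid x_i;\beta)\) over the training set is the same as minimizing \(l(\beta)\) above.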

Derivation:

Let \(g_i = g_i(\beta) = e^{\beta^T x_i}\); then \(g_i > 0\) and \(\frac{\partial g_i}{\partial \beta} = g_i x_i\), so

\[\nabla l(\beta) = \sum_{i=1}^m \left(-y_i x_i + \frac{1}{1+g_i}\frac{\partial g_i}{\partial \beta}\right) = \sum_{i=1}^m \left(-y_i x_i + \frac{g_i}{1+g_i}\, x_i\right) \]
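Since \(\frac{g_i}{1+g_i} = \frac{e^{\beta^T x_i}}{1+e^{\beta^T x_i}} = p_1(x_i;\beta)\) is exactly the predicted probability of the positive class, the gradient can also be written in the familiar sigmoid form:

\[\nabla l(\beta) = \sum_{i=1}^m \bigl(p_1(x_i;\beta) - y_i\bigr)\, x_i = -\sum_{i=1}^m x_i\bigl(y_i - p_1(x_i;\beta)\bigr), \]

which is what the code below computes as grad1 + grad2.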

The parameters are updated by gradient descent:

\[\beta \gets \beta - \alpha \nabla l(\beta) \]

where \(\alpha\) is the step size (learning rate).
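Before plugging the derived gradient into this update rule, it can be sanity-checked numerically. Below is a minimal sketch (the helper numerical_grad is my own, and loss/grad here simply mirror the formulas above; none of this is part of the original code) that compares the analytic gradient against central finite differences on random data:

import numpy as np

def loss(X, y, beta):
    # l(beta) = sum(-y_i * beta^T x_i + ln(1 + exp(beta^T x_i)))
    z = X.dot(beta)
    return np.sum(-y * z + np.log(1.0 + np.exp(z)))

def grad(X, y, beta):
    # analytic gradient: sum((g_i / (1 + g_i) - y_i) * x_i)
    g = np.exp(X.dot(beta))
    return (g / (1.0 + g) - y).dot(X)

def numerical_grad(X, y, beta, h=1e-6):
    # central finite differences, one coordinate at a time
    num = np.zeros_like(beta)
    for j in range(len(beta)):
        e = np.zeros_like(beta)
        e[j] = h
        num[j] = (loss(X, y, beta + e) - loss(X, y, beta - e)) / (2 * h)
    return num

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))
y = rng.integers(0, 2, size=5).astype(float)
beta = rng.normal(size=3)
# maximum difference should be very small (e.g. < 1e-6) if the derivation is correct
print(np.max(np.abs(grad(X, y, beta) - numerical_grad(X, y, beta))))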

The results are shown first.

Data visualization and loss curve:

Accuracy = 0.71
import numpy as np 
import matplotlib.pyplot as plt 

def loss(X, y, beta): # loss l(beta) at the current parameters beta (Eq. 3.27)
    loss1 = np.sum(-y * (X.dot(beta)))
    loss2 = np.sum(np.log(1 + np.exp(X.dot(beta))))
    return loss1 + loss2

def grad(X, y, beta): # first-order gradient of the loss at the current beta
    g = np.exp(X.dot(beta))
    grad1 = -y.dot(X) 
    grad2 = (g / (1.0 + g)).dot(X)
    return grad1 + grad2 

def get_data(data): # parse the raw data into X (with a leading intercept column of ones) and y; no train/test split here
    X = [(item.split(',')) for item in data.split()]
    X = np.array(X, dtype=float)  # np.float was removed from recent NumPy; use the builtin float
    y = X[:, 3]
    X = X[:,[1,2]]
    X = np.hstack([np.ones((len(X),1)), X])
    return X, y

def get_acc(X, y_true, beta): # classification accuracy on (X, y_true)
    y_pred = 1. / (1 + np.exp(-X.dot(beta)))
    y_pred = np.array(y_pred >= 0.5, dtype=float)
    acc = (y_pred == y_true).sum() / len(y_pred)
    return acc

def logistic_regression(X, y):
    epsilon = 1e-5   # stop when the loss changes by less than this between iterations
    max_epoch = 1000 # maximum number of iterations
    alpha = 0.1      # learning rate

    beta = np.random.random(X.shape[1]) # parameters to learn
    losses = [] # loss history

    for i in range(max_epoch):
        last_beta = beta.copy()
        grad_beta = grad(X, y, beta) # compute the gradient
        beta -= alpha * grad_beta    # gradient-descent step

        if abs(loss(X, y, beta) - loss(X, y, last_beta)) < epsilon: # converged
            break
        losses.append(loss(X, y, beta)) # record the loss

    acc = get_acc(X, y, beta)
    return acc, losses

# id, density, sugar content, label (1 = good melon, 0 = bad melon)
data = """
1,0.697,0.46,1
2,0.774,0.376,1
3,0.634,0.264,1
4,0.608,0.318,1
5,0.556,0.215,1
6,0.403,0.237,1
7,0.481,0.149,1
8,0.437,0.211,1
9,0.666,0.091,0
10,0.243,0.267,0
11,0.245,0.057,0
12,0.343,0.099,0
13,0.639,0.161,0
14,0.657,0.198,0
15,0.36,0.37,0
16,0.593,0.042,0
17,0.719,0.103,0
"""

X, y = get_data(data)
acc, losses = logistic_regression(X, y) # get the accuracy and the loss history
print(f"Accuracy = {acc:.2}")

# Visualization
# ax1: the original data, colored by class
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(12,4))
ax1.scatter(X[y==0,1], X[y==0,2], c='r', label="bad")
ax1.scatter(X[y==1,1], X[y==1,2], c='g', label="good")
ax1.set_xlabel("density", fontsize=15)
ax1.set_ylabel("sugar", fontsize=15)
ax1.legend()

# ax2: loss curve over epochs
ax2.plot(range(len(losses)), losses, 'b')
ax2.set_xlabel("epoch", fontsize=15)
ax2.set_ylabel("loss", fontsize=15)
plt.show()
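As an optional sanity check (not part of the original exercise), the result can be compared against scikit-learn's LogisticRegression. This is a sketch assuming scikit-learn is installed; it reuses the X, y produced by get_data above, and the large C value approximately disables regularization so the fit is close to plain maximum likelihood:

from sklearn.linear_model import LogisticRegression

# X already contains a column of ones, so disable scikit-learn's own intercept
clf = LogisticRegression(C=1e5, fit_intercept=False, max_iter=1000)
clf.fit(X, y)
print("sklearn accuracy =", clf.score(X, y))
print("sklearn beta     =", clf.coef_.ravel())

The learned coefficients and training accuracy should be in the same ballpark as the hand-rolled gradient-descent result, up to differences in regularization and stopping criteria.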