Problem Description

Choose two UCI datasets and compare the error rates of logistic regression as estimated by 10-fold cross-validation and by the leave-one-out method.

Solution

I picked the red-wine quality dataset, which has nearly 1,600 rows; the leave-one-out run nearly froze my machine. Logistic regression could be done with the sklearn library, but here I reuse the function written in the previous section, with small tweaks to the learning rate and related settings (a quick sklearn cross-check is sketched at the end). Note that what is computed below is accuracy rather than error rate (error rate = 1 - accuracy).
Final results:

10-fold cross-validation accuracy: 0.7404
Leave-one-out accuracy: 0.7398

The code is as follows:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def loss(X, y, beta): # compute the loss at the current parameters beta
    loss1 = np.sum(-y * (X.dot(beta)))
    loss2 = np.sum(np.logaddexp(0, X.dot(beta)))  # log(1 + exp(.)) without overflow
    return loss1 + loss2
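# Reference note (added): this loss is the logistic-regression negative
# log-likelihood, sum_i ( -y_i * beta^T x_i + ln(1 + exp(beta^T x_i)) ), where
# each x_i carries a leading 1 for the intercept (added further below).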

def grad(X, y, beta): # compute the first-order gradient of the loss at beta
    p = 1.0 / (1.0 + np.exp(-X.dot(beta)))  # sigmoid(X @ beta)
    grad1 = -y.dot(X)
    grad2 = p.dot(X)
    return grad1 + grad2
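# Reference note (added): the gradient works out to
# sum_i ( sigmoid(beta^T x_i) - y_i ) * x_i, the derivative of the loss above.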

def score(X, y_true, beta): # compute classification accuracy
    y_pred = 1.0 / (1.0 + np.exp(-X.dot(beta)))
    y_pred = np.asarray(y_pred >= 0.5, dtype=float)  # np.float was removed in NumPy 1.20+
    acc = (y_pred == y_true).sum() / len(y_pred)
    return acc
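# Note (added): thresholding the sigmoid output at 0.5 is equivalent to
# thresholding X.dot(beta) at 0.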

def logistic_reg_fit(X, y): # train on (X, y) and return the learned weights
    epsilon = 1e-5   # stop once the loss changes by less than this between two updates
    max_epoch = 500  # maximum number of iterations
    alpha = 0.0005   # learning rate

    beta = np.random.random(X.shape[1]) # parameters to learn

    for i in range(max_epoch):
        last_beta = beta.copy()
        grad_beta = grad(X, y, beta) # compute the gradient
        beta -= alpha * grad_beta    # gradient-descent step

        if abs(loss(X, y, beta) - loss(X, y, last_beta)) < epsilon:
            break
    return beta
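
# Optional sanity check (my addition, not part of the original solution):
# compare the analytic gradient with a central finite-difference estimate on
# small random data; the maximum absolute gap should be very small.
def grad_check(n=20, d=4, eps=1e-6):
    rng = np.random.RandomState(0)
    Xc, yc = rng.randn(n, d), rng.randint(0, 2, n).astype(float)
    b = rng.randn(d)
    num = np.array([(loss(Xc, yc, b + eps * e) - loss(Xc, yc, b - eps * e)) / (2 * eps)
                    for e in np.eye(d)])
    return np.max(np.abs(num - grad(Xc, yc, b)))
# print(grad_check())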

# Data source: "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine = pd.read_csv("e:/data/uci/winequality-red.csv", sep=";")

X, y = wine[wine.columns[:-1]], wine[wine.columns[-1]]
# y (quality) is an integer wine-quality score from 0 to 10; to get a binary
# task, map scores 0-5 to class 0 and scores 6-10 to class 1
y = y.apply(lambda label: 0 if label <= 5 else 1).to_numpy()

scaler = StandardScaler() # standardize the features
X = scaler.fit_transform(X)
X = np.hstack([np.ones((len(X), 1)), X]) # prepend a column of ones: the intercept term
print(X.shape, y.shape)
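
# Optional (added): the class balance gives a majority-class baseline to
# compare the accuracies below against.
print("positive fraction:", y.mean())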

## 10-fold cross-validation
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)

accs = []
for train_idx, test_idx in kf.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    beta = logistic_reg_fit(X_train, y_train)
    acc = score(X_test, y_test, beta)
    accs.append(acc)
print(np.mean(accs))
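print("fold-accuracy std:", np.std(accs))  # added: spread across the 10 folds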


## Leave-one-out validation
from sklearn.model_selection import LeaveOneOut

onef = LeaveOneOut()

accs = []
for train_idx, test_idx in onef.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    beta = logistic_reg_fit(X_train, y_train)
    acc = score(X_test, y_test, beta)
    accs.append(acc)
print(np.mean(accs))
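
As a cross-check (not part of the original run), the same two estimates can also be computed with sklearn's built-in LogisticRegression and cross_val_score; the numbers will differ slightly, since sklearn uses a regularized, fully converged solver rather than the hand-rolled gradient descent above. A minimal sketch, reusing the X and y prepared earlier:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(max_iter=1000)
X_sk = X[:, 1:]  # drop the hand-added ones column; sklearn fits its own intercept
print(cross_val_score(clf, X_sk, y, cv=KFold(n_splits=10)).mean())  # 10-fold CV accuracy
print(cross_val_score(clf, X_sk, y, cv=LeaveOneOut()).mean())       # leave-one-out accuracy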