Problem Description
Choose two UCI data sets and compare the error rates of logistic regression as estimated by 10-fold cross-validation and by the leave-one-out method.
Solution
I picked the wine quality data set, which has nearly 1,600 records; the leave-one-out run nearly froze my computer.
Logistic regression could be done with the sklearn library, but here I reuse the function written in the previous section, with the learning rate and a few other settings slightly adjusted. What is ultimately computed is the accuracy (rather than the error rate).
Final results:
10-fold cross-validation accuracy: 0.7404
Leave-one-out accuracy: 0.7398
The code is as follows:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
def loss(X, y, beta):  # compute the loss at the current parameters beta from the loss formula
    loss1 = np.sum(-y * (X.dot(beta)))
    loss2 = np.sum(np.log(1 + np.exp(X.dot(beta))))
    return loss1 + loss2
def grad(X, y, beta):  # compute the first-order gradient at the current parameters beta
    g = np.exp(X.dot(beta))
    grad1 = -y.dot(X)
    grad2 = (g / (1.0 + g)).dot(X)
    return grad1 + grad2
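# Optional sanity check (a sketch added here, not part of the original solution):
# compare grad() against a central finite-difference approximation of loss()
# on small synthetic data; check_grad and its sizes are illustrative choices.
def check_grad(n=20, d=4, eps=1e-6):
    rng = np.random.RandomState(0)
    Xc, yc = rng.randn(n, d), rng.randint(0, 2, n)
    b = rng.randn(d)
    num = np.zeros(d)
    for j in range(d):
        step = np.zeros(d)
        step[j] = eps
        num[j] = (loss(Xc, yc, b + step) - loss(Xc, yc, b - step)) / (2 * eps)
    return np.max(np.abs(grad(Xc, yc, b) - num))  # should be close to 0

# print(check_grad())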
def score(X, y_true, beta):  # compute the accuracy on (X, y_true)
    y_pred = 1. / (1 + np.exp(-X.dot(beta)))
    y_pred = np.array(y_pred >= 0.5, dtype=float)  # np.float was removed in NumPy 1.24
    acc = (y_pred == y_true).sum() / len(y_pred)
    return acc
def logistic_reg_fit(X, y):  # train on the given data and labels; return the learned weights
    epsilon = 1e-5   # stop when the loss changes by less than this between two updates
    max_epoch = 500  # maximum number of iterations
    alpha = 0.0005   # learning rate
    beta = np.random.random(X.shape[1])  # parameters to learn
    for i in range(max_epoch):
        last_beta = beta.copy()
        grad_beta = grad(X, y, beta)  # compute the gradient
        beta -= alpha * grad_beta     # gradient-descent step
        if abs(loss(X, y, beta) - loss(X, y, last_beta)) < epsilon:
            break
    return beta
# Data source: "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine = pd.read_csv("e:/data/uci/winequality-red.csv", sep=";")
X, y = wine[wine.columns[:-1]], wine[wine.columns[-1]]
# y (quality) is an integer wine-quality score between 0 and 10; to turn this into a
# binary classification task, map 0-5 to class 0 and 6-10 to class 1
y = y.apply(lambda label: 0 if label <= 5 else 1).values
scaler = StandardScaler()  # standardize the features
X = scaler.fit_transform(X)
X = np.hstack([np.ones((len(X), 1)), X])  # prepend a column of ones as the intercept term
print(X.shape, y.shape)  # (1599, 12) (1599,)
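# Added check: the two classes are roughly balanced (about 744 zeros vs 855 ones
# in this dump), so an accuracy around 0.74 is well above the ~0.53
# majority-class baseline.
print(np.bincount(y))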
## 10-fold cross-validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
accs = []
for train_idx, test_idx in kf.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    beta = logistic_reg_fit(X_train, y_train)
    acc = score(X_test, y_test, beta)
    accs.append(acc)
print(np.mean(accs))
## Leave-one-out validation
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
accs = []
for train_idx, test_idx in loo.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    beta = logistic_reg_fit(X_train, y_train)
    acc = score(X_test, y_test, beta)
    accs.append(acc)
print(np.mean(accs))
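As a cross-check, the same comparison can be run with sklearn's built-in LogisticRegression, as mentioned above. Below is a minimal sketch, assuming the X and y prepared earlier in the script; the manually added intercept column is dropped since sklearn fits its own, and max_iter=1000 is an arbitrary choice to ensure convergence. The leave-one-out run again trains one model per sample, so expect it to be slow.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut

clf = LogisticRegression(max_iter=1000)
X_sk = X[:, 1:]  # drop the hand-added intercept column; sklearn adds its own

print(cross_val_score(clf, X_sk, y, cv=KFold(n_splits=10)).mean())  # 10-fold accuracy
print(cross_val_score(clf, X_sk, y, cv=LeaveOneOut()).mean())       # leave-one-out accuracy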