import struct
import numpy as np
import pandas as pd


class Dataset:
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels

    # Fetch one sample: dataset[index] -> (image, label)
    def __getitem__(self, index):
        return self.images[index], self.labels[index]

    # Number of samples in the dataset
    def __len__(self):
        return len(self.images)


class DataLoaderIterator:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.cursor = 0
        self.indices = list(range(self.dataloader.count_data))  # [0, count_data)
        if self.dataloader.shuffle:
            # Randomize the sample order once per epoch
            np.random.shuffle(self.indices)

    def __next__(self):
        if self.cursor >= self.dataloader.count_data:
            raise StopIteration()

        batch_data = []
        # The last batch may be smaller than batch_size
        remain = min(self.dataloader.batch_size, self.dataloader.count_data - self.cursor)
        for n in range(remain):
            index = self.indices[self.cursor]
            data = self.dataloader.dataset[index]

            # Lazily create one list per field of the sample tuple
            if len(batch_data) == 0:
                batch_data = [[] for _ in range(len(data))]

            # Append each field of the sample to its list
            for field, item in enumerate(data):
                batch_data[field].append(item)
            self.cursor += 1

        # Merge once with np.vstack instead of concatenating incrementally
        for field in range(len(batch_data)):
            batch_data[field] = np.vstack(batch_data[field])
        return batch_data


class DataLoader:
    # shuffle: randomize the sample order at the start of each epoch
    def __init__(self, dataset, batch_size, shuffle):
        self.dataset = dataset
        self.shuffle = shuffle
        self.count_data = len(dataset)
        self.batch_size = batch_size

    def __iter__(self):
        return DataLoaderIterator(self)


class Module:
    def __init__(self, name):
        self.name = name
        self.train_mode = False

    def __call__(self, *args):
        return self.forward(*args)

    def train(self):
        self.train_mode = True
        for m in self.modules():
            m.train()

    def eval(self):
        self.train_mode = False
        for m in self.modules():
            m.eval()

    def modules(self):
        ms = []
        for attr in self.__dict__:
            m = self.__dict__[attr]
            if isinstance(m, Module):
                ms.append(m)
        return ms

    def params(self):
        ps = []
        for attr in self.__dict__:
            p = self.__dict__[attr]
            if isinstance(p, Parameter):
                ps.append(p)
        ms = self.modules()
        for m in ms:
            ps.extend(m.params())
        return ps

    def info(self, n):
        ms = self.modules()
        output = f"{self.name}\n"
        for m in ms:
            output += (' ' * (n + 1)) + f"{m.info(n + 1)}\n"
        return output[:-1]

    def __repr__(self):
        return self.info(0)


class Initializer:
    def __init__(self, name):
        self.name = name

    def __call__(self, *args):
        return self.apply(*args)


class GaussInitializer(Initializer):
    # where :math:`\mu` is the mean and :math:`\sigma` the standard
    # deviation. The square of the standard deviation, :math:`\sigma^2`,
    # is called the variance.
    def __init__(self, mu, sigma):
        super().__init__("GaussInitializer")
        self.mu = mu
        self.sigma = sigma

    def apply(self, value):
        value[...] = np.random.normal(self.mu, self.sigma, value.shape)


class Parameter:
    def __init__(self, value):
        self.value = value
        self.delta = np.zeros(value.shape)

    def zero_grad(self):
        self.delta[...] = 0
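
# A minimal sketch (hypothetical toy arrays, not part of the training script
# below) of how Dataset and DataLoader compose; note the last batch is allowed
# to be smaller than batch_size:
#
#   toy_images = np.arange(12, dtype=np.float32).reshape(6, 2)
#   toy_labels = np.arange(6, dtype=np.float32).reshape(6, 1)
#   loader = DataLoader(Dataset(toy_images, toy_labels), batch_size=4, shuffle=False)
#   for batch_images, batch_labels in loader:
#       print(batch_images.shape, batch_labels.shape)  # (4, 2) (4, 1), then (2, 2) (2, 1)
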
class Linear(Module):
    def __init__(self, input_feature, output_feature):
        super().__init__("Linear")
        self.input_feature = input_feature
        self.output_feature = output_feature
        self.weights = Parameter(np.zeros((input_feature, output_feature)))
        self.bias = Parameter(np.zeros((1, output_feature)))

        # Weight initialization, He/Kaiming-style: std = sqrt(2 / fan_in)
        initer = GaussInitializer(0, np.sqrt(2 / input_feature))
        initer.apply(self.weights.value)

    def forward(self, x):
        self.x_save = x.copy()
        return x @ self.weights.value + self.bias.value

    # For C = A @ B with upstream gradient G:
    #   dB = A.T @ G
    #   dA = G @ B.T
    def backward(self, G):
        self.weights.delta += self.x_save.T @ G
        self.bias.delta += np.sum(G, 0)  # bias is broadcast in forward, so sum over the batch
        return G @ self.weights.value.T


class ReLU(Module):
    def __init__(self, inplace=True):
        super().__init__("ReLU")
        self.inplace = inplace

    def forward(self, x):
        self.negative_position = x < 0
        if not self.inplace:
            x = x.copy()
        x[self.negative_position] = 0
        return x

    def backward(self, G):
        if not self.inplace:
            G = G.copy()
        G[self.negative_position] = 0
        return G


def sigmoid(x):
    # Numerically stable sigmoid: avoids exp overflow for large |x|
    p0 = x < 0
    p1 = ~p0
    x = x.copy()  # copy so an integer input array does not silently lose precision
    x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
    x[p1] = 1 / (1 + np.exp(-x[p1]))
    return x


class SWish(Module):
    def __init__(self):
        super().__init__("SWish")

    def forward(self, x):
        self.x_save = x.copy()
        self.sx = sigmoid(x)
        return x * self.sx

    def backward(self, G):
        # d/dx [x * s(x)] = s(x) + x * s(x) * (1 - s(x))
        return G * (self.sx + self.x_save * self.sx * (1 - self.sx))


class Dropout(Module):
    def __init__(self, prob_keep=0.5, inplace=True):
        super().__init__("Dropout")
        self.prob_keep = prob_keep
        self.inplace = inplace

    def forward(self, x):
        if not self.train_mode:
            return x

        # Boolean mask: True with probability (1 - prob_keep) marks dropped units.
        # The mask must be boolean; an int 0/1 array would index rows 0 and 1
        # instead of masking elements.
        self.mask = np.random.binomial(size=x.shape, p=1 - self.prob_keep, n=1).astype(bool)
        if not self.inplace:
            x = x.copy()
        x[self.mask] = 0
        x *= 1 / self.prob_keep  # inverted dropout: rescale so the expectation is unchanged
        return x

    def backward(self, G):
        if not self.inplace:
            G = G.copy()
        G[self.mask] = 0
        G *= 1 / self.prob_keep
        return G


class ModuleList(Module):
    def __init__(self, *args):
        super().__init__("ModuleList")
        self.ms = list(args)

    def modules(self):
        return self.ms

    def forward(self, x):
        for m in self.ms:
            x = m(x)
        return x

    def backward(self, G):
        # Walk the modules in reverse order for backpropagation
        for i in range(len(self.ms) - 1, -1, -1):
            G = self.ms[i].backward(G)
        return G


class SigmoidCrossEntropy(Module):
    def __init__(self, params, weight_decay=1e-5):
        super().__init__("CrossEntropyLoss")
        self.params = params
        self.weight_decay = weight_decay

    def sigmoid(self, x):
        # Numerically stable form of 1 / (1 + np.exp(-x))
        p0 = x < 0
        p1 = ~p0
        x = x.copy()
        x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
        x[p1] = 1 / (1 + np.exp(-x[p1]))
        return x

    def decay_loss(self):
        # Weight decay on the L2 norm (not the squared norm) of each parameter
        loss = 0
        for p in self.params:
            loss += np.sqrt(np.sum(p.value ** 2)) / (2 * p.value.size) * self.weight_decay
        return loss

    def decay_backward(self):
        eps = 1e-8
        for p in self.params:
            p.delta += 1 / (2 * np.sqrt(np.sum(p.value ** 2)) + eps) / (
                2 * p.value.size) * self.weight_decay * 2 * p.value

    def forward(self, x, label_onehot):
        eps = 1e-6
        self.label_onehot = label_onehot
        self.predict = self.sigmoid(x)
        self.predict = np.clip(self.predict, a_max=1 - eps, a_min=eps)  # clip to avoid log(0)
        self.batch_size = self.predict.shape[0]
        return -np.sum(label_onehot * np.log(self.predict) +
                       (1 - label_onehot) * np.log(1 - self.predict)) / self.batch_size + self.decay_loss()

    def backward(self):
        self.decay_backward()
        return (self.predict - self.label_onehot) / self.batch_size
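
# A quick, commented-out sanity check (hypothetical, not part of the script):
# verify Linear.backward against a finite-difference gradient. With the scalar
# loss L = sum(layer(x)), the upstream gradient G is all ones.
#
#   layer = Linear(3, 2)
#   x = np.random.randn(4, 3)
#   y = layer(x)
#   layer.backward(np.ones_like(y))
#   h = 1e-5
#   layer.weights.value[0, 0] += h
#   numeric = (np.sum(layer(x)) - np.sum(y)) / h
#   layer.weights.value[0, 0] -= h
#   # layer.weights.delta[0, 0] and numeric should agree to roughly 1e-5
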
class SoftmaxCrossEntropy(Module):
    def __init__(self):
        super().__init__("SoftmaxCrossEntropy")

    def softmax(self, x):
        # Subtract the row-wise max for numerical stability
        max_x = np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x - max_x)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, x, label_onehot):
        eps = 1e-6
        self.label_onehot = label_onehot
        self.predict = self.softmax(x)
        self.predict = np.clip(self.predict, a_max=1 - eps, a_min=eps)  # clip to avoid log(0)
        self.batch_size = self.predict.shape[0]
        return -np.sum(label_onehot * np.log(self.predict)) / self.batch_size

    def backward(self):
        return (self.predict - self.label_onehot) / self.batch_size


class Optimizer:
    def __init__(self, name, model, lr):
        self.name = name
        self.model = model
        self.lr = lr
        self.params = model.params()

    def zero_grad(self):
        for param in self.params:
            param.zero_grad()

    def set_lr(self, lr):
        self.lr = lr


class SGD(Optimizer):
    def __init__(self, model, lr=1e-3):
        super().__init__("SGD", model, lr)

    def step(self):
        for param in self.params:
            param.value -= self.lr * param.delta


class SGDMomentum(Optimizer):
    def __init__(self, model, lr=1e-3, momentum=0.9):
        super().__init__("SGDMomentum", model, lr)
        self.momentum = momentum
        for param in self.params:
            param.v = 0  # velocity (moving average of gradients)

    def step(self):
        for param in self.params:
            param.v = self.momentum * param.v - self.lr * param.delta
            param.value += param.v


class Adam(Optimizer):
    def __init__(self, model, lr=1e-3, beta1=0.9, beta2=0.999, l2_regularization=0):
        super().__init__("Adam", model, lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.l2_regularization = l2_regularization
        self.t = 0
        for param in self.params:
            param.m = 0  # exponential moving average of the gradient
            param.v = 0  # exponential moving average of the squared gradient

    def step(self):
        eps = 1e-8
        self.t += 1
        for param in self.params:
            g = param.delta
            param.m = self.beta1 * param.m + (1 - self.beta1) * g
            param.v = self.beta2 * param.v + (1 - self.beta2) * g ** 2

            # Bias-corrected moments, then the update (plus decoupled L2 if enabled)
            mt_ = param.m / (1 - self.beta1 ** self.t)
            vt_ = param.v / (1 - self.beta2 ** self.t)
            param.value -= self.lr * mt_ / (np.sqrt(vt_) + eps) + self.l2_regularization * param.value


class Model(Module):
    def __init__(self, num_feature, num_hidden, num_classes):
        super().__init__("Model")
        self.backbone = ModuleList(
            Linear(num_feature, num_hidden),
            ReLU(),
            Dropout(),
            Linear(num_hidden, num_classes)
        )

    def forward(self, x):
        return self.backbone(x)

    def backward(self, G):
        return self.backbone.backward(G)


def estimate_val(predict, gt_labels, classes, loss_func):
    plabel = predict.argmax(1)
    positive = plabel == gt_labels
    accuracy = np.mean(positive)
    return accuracy, loss_func(predict, one_hot(gt_labels, classes))


def lr_schedule_cosine(lr_min, lr_max, per_epochs):
    # Cosine annealing from lr_max down to lr_min over per_epochs epochs
    def compute(epoch):
        return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch / per_epochs * np.pi))
    return compute


def load_images(file):
    # MNIST image file (idx3-ubyte): big-endian header of four int32, then uint8 pixels
    with open(file, "rb") as f:
        data = f.read()

    magic_number, num_samples, image_width, image_height = struct.unpack(">iiii", data[:16])
    if magic_number != 2051:  # 0x00000803
        print(f"magic number mismatch {magic_number} != 2051")
        return None

    image_data = np.frombuffer(data[16:], dtype=np.uint8).reshape(num_samples, -1)
    return image_data


def one_hot(labels, classes, label_smoothing=0):
    n = len(labels)
    eoff = label_smoothing / classes
    output = np.ones((n, classes), dtype=np.float32) * eoff
    for row, label in enumerate(labels):
        output[row, label] = 1 - label_smoothing + eoff
    return output


def load_labels(file):
    # MNIST label file (idx1-ubyte): big-endian header of two int32, then uint8 labels
    with open(file, "rb") as f:
        data = f.read()

    magic_number, num_samples = struct.unpack(">ii", data[:8])
    if magic_number != 2049:  # 0x00000801
        print(f"magic number mismatch {magic_number} != 2049")
        return None

    labels = np.frombuffer(data[8:], dtype=np.uint8)
    return labels
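
# lr_schedule_cosine above is not wired into the training loop below, which
# uses a step-schedule dict instead. A commented-out sketch of how it could be
# used (values here are illustrative):
#
#   cosine_lr = lr_schedule_cosine(lr_min=1e-5, lr_max=1e-2, per_epochs=20)
#   # inside the epoch loop:
#   #     optim.set_lr(cosine_lr(epoch))
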
val_labels = load_labels("E:/杜老师课程/dataset/t10k-labels-idx1-ubyte")  # (10000,)
val_images = load_images("E:/杜老师课程/dataset/t10k-images-idx3-ubyte")  # (10000, 784)
numdata = val_images.shape[0]  # 10000
# Normalize to [-0.5, 0.5] and append a constant-1 column (bias trick) -> (10000, 785)
val_images = np.hstack((val_images / 255 - 0.5, np.ones((numdata, 1))))
val_pd = pd.DataFrame(val_labels, columns=["label"])

train_labels = load_labels("E:/杜老师课程/dataset/train-labels-idx1-ubyte")  # (60000,)
train_images = load_images("E:/杜老师课程/dataset/train-images-idx3-ubyte")  # (60000, 784)
numdata = train_images.shape[0]  # 60000
train_images = np.hstack((train_images / 255 - 0.5, np.ones((numdata, 1))))  # (60000, 785)
train_pd = pd.DataFrame(train_labels, columns=["label"])

np.random.seed(3)
classes = 10       # 10 digit classes
batch_size = 64    # mini-batch size
epochs = 20        # stopping criterion: see the full dataset at most 20 times
lr = 1e-2
numdata, data_dims = train_images.shape  # 60000, 785

# Dataset + DataLoader handle batch fetching
train_data = DataLoader(Dataset(train_images, one_hot(train_labels, classes)), batch_size, shuffle=True)
model = Model(data_dims, 1024, classes)

# loss_func = SoftmaxCrossEntropy()
loss_func = SigmoidCrossEntropy(model.params(), 0)
optim = Adam(model, lr)
iters = 0  # global iteration counter (the x-axis if plotting a loss curve)
lr_schedule = {
    5: 1e-3,
    15: 1e-4,
    18: 1e-5
}

# Epoch loop: runs epochs times in total
for epoch in range(epochs):
    if epoch in lr_schedule:
        lr = lr_schedule[epoch]
        optim.set_lr(lr)

    model.train()
    # Iterate over the mini-batches of one epoch (batch_size = 64)
    for index, (images, labels) in enumerate(train_data):
        x = model(images)

        # Compute the loss
        loss = loss_func(x, labels)

        optim.zero_grad()
        G = loss_func.backward()
        model.backward(G)
        optim.step()  # apply the gradients, update the parameters
        iters += 1

    print(f"Iter {iters}, {epoch} / {epochs}, Loss {loss:.3f}, LR {lr:g}")

    model.eval()
    val_accuracy, val_loss = estimate_val(model(val_images), val_labels, classes, loss_func)
    print(f"Val set, Accuracy: {val_accuracy:.6f}, Loss: {val_loss:.3f}")
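
# val_pd / train_pd are built above but not otherwise used. A commented-out
# sketch (hypothetical column names) of using val_pd for a per-class accuracy
# breakdown after training:
#
#   val_pd["predict"] = model(val_images).argmax(1)
#   val_pd["correct"] = val_pd["predict"] == val_pd["label"]
#   print(val_pd.groupby("label")["correct"].mean())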