CS231N Assignment 1 two_layer_net Notes
two_layer_net.ipynb
I had previously misunderstood what x.reshape(x.shape[0], -1) outputs:
```python
x = [[1, 4, 7, 2], [2, 5, 7, 4]]
x = np.array(x)
x0 = x.reshape(x.shape[0], -1)
x1 = x.reshape(x.shape[1], -1)
print(x0)
print(x1)
```
The actual output is shown below: reshape works in row-major order, so x.reshape(x.shape[1], -1) re-chunks the flattened data into 4 rows of 2 rather than transposing the array.
```
[[1 4 7 2]
 [2 5 7 4]]
[[1 4]
 [7 2]
 [2 5]
 [7 4]]
```
Affine layer: forward
```python
# Test the affine_forward function
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
# print(np.prod(input_shape))  # 120
weight_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
# print(*input_shape)  # 4 5 6
print(np.shape(x))  # (2, 4, 5, 6)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = affine_forward(x, w, b)
correct_out = np.array([[ 1.49834967,  1.70660132,  1.91485297],
                        [ 3.25553199,  3.5141327,   3.77273342]])

# Compare your output with ours. The error should be around e-9 or less.
print('Testing affine_forward function:')
print('difference: ', rel_error(out, correct_out))
```
The function to implement is:
```python
def affine_forward(x, w, b):
    out = None
    x_vector = x.reshape(x.shape[0], -1)  # flatten each sample into a row: (N, D)
    out = x_vector.dot(w) + b             # (N, D) dot (D, M) + (M,) -> (N, M)
    cache = (x, w, b)                     # save the inputs for the backward pass
    return out, cache
```
Affine layer: backward
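The notebook verifies affine_backward with a numerical gradient check. Below is a sketch of that kind of test cell; it assumes the assignment's helpers eval_numerical_gradient_array (from cs231n.gradient_check) and rel_error are available, and it uses the shapes referenced in the comments of the function that follows.

```python
# Sketch of a numerical gradient check for affine_backward; the helpers
# eval_numerical_gradient_array and rel_error come from the assignment code.
np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)

dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)

# The errors should be around e-10 or less.
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
```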
The function is:
```python
def affine_backward(dout, cache):
    x, w, b = cache
    dx, dw, db = None, None, None
    # Shapes in the check above: x (10, 2, 3), w (6, 5), b (5,), dout (10, 5)
    dx = np.dot(dout, w.T)                          # (N, M) dot (D, M).T -> (N, D): (10, 6)
    dx = dx.reshape(x.shape)                        # reshape dx back to the shape of x: (10, 2, 3)
    dw = np.dot(x.reshape(x.shape[0], -1).T, dout)  # (D, N) dot (N, M) -> (D, M): (6, 10) dot (10, 5) = (6, 5)
    db = np.sum(dout, axis=0)                       # sum over the sample dimension -> shape (M,)
    return dx, dw, db
```
ReLU activation
The forward pass is just

```python
out = np.maximum(x, 0)
```
For the backward pass I got it wrong at first and wrote

```python
dx = np.maximum(dx, 0)
```

which is obviously wrong: the mask should be based on whether x is less than or equal to zero, not on dx. The correct version is

```python
dx = np.copy(dout)  # start from the upstream gradient
dx[x <= 0] = 0      # zero it out wherever the ReLU input was non-positive
```

and that is all that is needed.
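Put together, the two ReLU functions in layers.py come out roughly like this (a sketch following the assignment's function signatures, where the cache is simply the input x):

```python
def relu_forward(x):
    # Element-wise max with zero; cache the input for the backward pass.
    out = np.maximum(x, 0)
    cache = x
    return out, cache


def relu_backward(dout, cache):
    # Pass the upstream gradient through only where the input was positive.
    x = cache
    dx = np.copy(dout)
    dx[x <= 0] = 0
    return dx
```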
Sandwich layers
Looking at layer_utils.py, these wrappers just combine affine and relu, and the reported errors are likewise around e-11 to e-12.
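Roughly, the pattern in layer_utils.py is to chain the two forward passes, keep both caches, and unwind them in reverse order on the backward pass. A sketch of that pattern (not copied verbatim from the file):

```python
def affine_relu_forward(x, w, b):
    # Affine transform followed by a ReLU; keep both caches for backprop.
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache


def affine_relu_backward(dout, cache):
    # Backprop through the ReLU first, then through the affine layer.
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db
```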
Loss layers: SVM & Softmax
svm
```python
def svm_loss(x, y):
    loss, dx = None, None
    num_train = x.shape[0]
    scores = x - np.max(x, axis=1, keepdims=True)   # shift scores for numerical stability
    correct_class_scores = scores[np.arange(num_train), y]
    margins = np.maximum(0, scores - correct_class_scores[:, np.newaxis] + 1)
    margins[np.arange(num_train), y] = 0            # the correct class contributes no margin
    loss = np.sum(margins) / num_train

    num_pos = np.sum(margins > 0, axis=1)           # classes with a positive margin per sample
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(num_train), y] -= num_pos          # the correct class collects -num_pos
    dx /= num_train
    return loss, dx
```
I made a small (4, 3) example to walk through the intermediate values:
```
x:
[[ 4.17943411e-04  1.39710028e-03 -1.78590431e-03]
 [-7.08827734e-04 -7.47253161e-05 -7.75016769e-04]
 [-1.49797903e-04  1.86172902e-03 -1.42552930e-03]
 [-3.76356699e-04 -3.42275390e-04  2.94907637e-04]]
y: [2 1 1 0]
scores (after subtracting the row max):
[[-0.00097916  0.         -0.003183  ]
 [-0.0006341   0.         -0.00070029]
 [-0.00201153  0.         -0.00328726]
 [-0.00067126 -0.00063718  0.        ]]
correct class scores: [-0.003183    0.          0.         -0.00067126]
margins (before zeroing the correct class):
[[1.00220385 1.003183   1.        ]
 [0.9993659  1.         0.99929971]
 [0.99798847 1.         0.99671274]
 [1.         1.00003408 1.00067126]]
margins (after zeroing the correct class):
[[1.00220385 1.003183   0.        ]
 [0.9993659  0.         0.99929971]
 [0.99798847 0.         0.99671274]
 [0.         1.00003408 1.00067126]]
num_pos: [2 2 2 2]
dx (indicator of positive margins):
[[1. 1. 0.]
 [1. 0. 1.]
 [1. 0. 1.]
 [0. 1. 1.]]
dx (after subtracting num_pos at the correct class, before dividing by num_train):
[[ 1.  1. -2.]
 [ 1. -2.  1.]
 [ 1. -2.  1.]
 [-2.  1.  1.]]
```
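One property visible in the trace (an illustrative check of my own, not something from the notebook): each row of dx sums to zero, because the +1 entries for positive margins are exactly cancelled by the -num_pos entry at the correct class, and dividing by num_train does not change that.

```python
# Illustrative check (not part of the assignment): every row of the SVM
# gradient sums to zero, since the correct class gets -num_pos while each
# class with a positive margin gets +1.
x = 0.001 * np.random.randn(4, 3)   # 4 samples, 3 classes, same shape as the trace
y = np.array([2, 1, 1, 0])
loss, dx = svm_loss(x, y)
print(np.allclose(dx.sum(axis=1), 0))   # True
```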
softmax
Compared with the earlier softmax implementation, dx here drops the x.T.dot(dscores) step: x is already the score matrix, so its gradient is simply dscores.
```python
def softmax_loss(x, y):
    loss, dx = None, None
    num_train = x.shape[0]
    scores = x - np.max(x, axis=1, keepdims=True)   # shift for numerical stability
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    loss = np.sum(-np.log(probs[np.arange(num_train), y])) / num_train

    # Compute the gradient
    dscores = probs
    dscores[np.arange(num_train), y] -= 1
    dscores /= num_train
    dx = dscores
    return loss, dx
```
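As a quick sanity check (illustrative, in the same spirit as the notebook's loss checks): with tiny random scores the predicted distribution is nearly uniform, so the loss should come out close to ln(num_classes), about 2.3 for 10 classes.

```python
# Illustrative sanity check: near-uniform scores give a softmax loss of
# roughly ln(num_classes) (about 2.3 for 10 classes).
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)
loss, dx = softmax_loss(x, y)
print(loss)       # expected to be around 2.3
print(dx.shape)   # (50, 10), same shape as the input scores
```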
Two-layer network
For this part, look at the TwoLayerNet class in fc_net.py.
Assume the input dimension is D, the hidden layer dimension is H, and the number of classes is C.
The architecture is affine - relu - affine - softmax; the class itself does not perform gradient descent (optimization is handled separately).
The model parameters are stored in the dictionary self.params.
```python
from builtins import range
from builtins import object
import numpy as np

from ..layers import *
from ..layer_utils import *


class TwoLayerNet(object):
    def __init__(
        self,
        input_dim=3 * 32 * 32,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
    ):
        self.params = {}
        self.reg = reg
        self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
        self.params['b1'] = np.zeros(hidden_dim)
        self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b2'] = np.zeros(num_classes)

    def loss(self, X, y=None):
        scores = None
        X = X.reshape(X.shape[0], -1)  # flatten to (N, D) in case X arrives as images, e.g. (N, 3, 32, 32)

        # Forward pass: affine -> ReLU -> affine
        hidden_layer = np.maximum(0, np.dot(X, self.params['W1']) + self.params['b1'])  # ReLU activation
        scores = np.dot(hidden_layer, self.params['W2']) + self.params['b2']

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        num_train = X.shape[0]
        scores -= np.max(scores, axis=1, keepdims=True)  # for numerical stability
        softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
        correct_class_scores = softmax_scores[range(num_train), y]
        data_loss = -np.log(correct_class_scores).mean()
        reg_loss = 0.5 * self.reg * (np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2))
        loss = data_loss + reg_loss

        # Backward pass
        dscores = softmax_scores.copy()
        dscores[range(num_train), y] -= 1
        dscores /= num_train

        grads['W2'] = np.dot(hidden_layer.T, dscores) + self.reg * self.params['W2']
        grads['b2'] = np.sum(dscores, axis=0)

        dhidden = np.dot(dscores, self.params['W2'].T)
        dhidden[hidden_layer <= 0] = 0  # backpropagate through ReLU
        grads['W1'] = np.dot(X.T, dhidden) + self.reg * self.params['W1']
        grads['b1'] = np.sum(dhidden, axis=0)

        return loss, grads
```
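To see the two call modes of loss, here is an illustrative snippet (the sizes and seed are my own choices, not the notebook's exact test cell):

```python
# Illustrative usage of TwoLayerNet (sizes chosen arbitrarily for a quick check).
np.random.seed(231)
N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)

model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=1e-2, reg=0.1)

scores = model.loss(X)          # test mode: y is None, returns (N, C) class scores
loss, grads = model.loss(X, y)  # training mode: returns the loss and a dict of gradients
print(scores.shape, loss)
print(sorted(grads.keys()))     # ['W1', 'W2', 'b1', 'b2']
```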