CS231N Assignment1 two_layer_net notes

two_layer_net.ipynb

I previously misunderstood the output of x.reshape(x.shape[0], -1). For example:

import numpy as np

x = [[1,4,7,2],[2,5,7,4]]
x = np.array(x)
x0 = x.reshape(x.shape[0], -1)
x1 = x.reshape(x.shape[1], -1)
print(x0)
print(x1)

The actual output is:

[[1 4 7 2]
 [2 5 7 4]]
[[1 4]
 [7 2]
 [2 5]
 [7 4]]
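So x.reshape(x.shape[0], -1) keeps the first (sample) dimension and flattens everything else into one row per sample, which is exactly what the affine layer below needs. A quick check with a made-up (2, 4, 5, 6) input:

x = np.random.randn(2, 4, 5, 6)          # 2 samples, each of shape (4, 5, 6)
print(x.reshape(x.shape[0], -1).shape)   # (2, 120): one flattened row per sample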

Affine layer: forward

# Test the affine_forward function

num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
#print(np.prod(input_shape)) #120
weight_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
#print(*input_shape) # 4 5 6
print(np.shape(x))# (2, 4, 5, 6)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = affine_forward(x, w, b)
correct_out = np.array([[ 1.49834967,  1.70660132,  1.91485297],
                        [ 3.25553199,  3.5141327,   3.77273342]])

# Compare your output with ours. The error should be around e-9 or less.
print('Testing affine_forward function:')
print('difference: ', rel_error(out, correct_out))

The function to implement is:

def affine_forward(x, w, b):

    out = None

    x_vector = x.reshape(x.shape[0], -1)  # flatten each sample into a row of length D
    out = x_vector.dot(w) + b             # (N, D) dot (D, M) + (M,) -> (N, M)

    cache = (x, w, b)
    return out, cache

Affine layer: backward


The backward function takes the upstream gradient dout and the cache saved by affine_forward:

def affine_backward(dout, cache):

    x, w, b = cache
    dx, dw, db = None, None, None

    # Example shapes from the gradient check: x (10, 2, 3), w (6, 5), b (5,), dout (10, 5)
    dx = np.dot(dout, w.T)        # (N, M) dot (D, M).T -> (N, D): (10, 6)
    dx = dx.reshape(x.shape)      # reshape dx back to the shape of x: (10, 2, 3)

    dw = np.dot(x.reshape(x.shape[0], -1).T, dout)  # (D, N) dot (N, M) -> (D, M): (6, 10) dot (10, 5) = (6, 5)

    db = np.sum(dout, axis=0)     # sum over the sample dimension -> gradient of shape (M,)

    return dx, dw, db
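The backward pass is verified with a numeric gradient check. Below is a sketch along the lines of the notebook's test, with the shapes taken from the comment above; it assumes the notebook's rel_error helper and the assignment's eval_numerical_gradient_array function are available.

from cs231n.gradient_check import eval_numerical_gradient_array

np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)

# Numeric gradients of the forward pass with respect to x, w, and b
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

# Analytic gradients from affine_backward
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)

# All three errors should be tiny (around e-10 or less)
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))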

 

ReLU activation

Forward pass:

out = np.maximum(x,0)

For the backward pass I got it wrong at first and wrote

dx = np.maximum(dx, 0)

which is clearly wrong: the mask should test whether x is less than or equal to zero, not dx. The correct version is

dx = np.copy(dout)  
dx[x <= 0] = 0

and that does it.
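Putting the two directions together, the full ReLU layer functions look roughly like this (a sketch that follows the same cache convention as the affine layer above):

def relu_forward(x):
    out = np.maximum(x, 0)   # elementwise max with zero
    cache = x                # keep the input for the backward pass
    return out, cache

def relu_backward(dout, cache):
    x = cache
    dx = np.copy(dout)
    dx[x <= 0] = 0           # gradient only flows where the forward input was positive
    return dx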

Sandwich layers

Looking at layer_utils.py, it simply wraps the affine and ReLU layers together; the reported errors are also on the order of e-11 to e-12.
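Conceptually it is just the two forward functions chained, with the two backward functions applied in reverse order. A sketch of what affine_relu_forward / affine_relu_backward look like under that assumption:

def affine_relu_forward(x, w, b):
    a, fc_cache = affine_forward(x, w, b)       # affine first ...
    out, relu_cache = relu_forward(a)           # ... then ReLU
    cache = (fc_cache, relu_cache)
    return out, cache

def affine_relu_backward(dout, cache):
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)        # undo the ReLU first ...
    dx, dw, db = affine_backward(da, fc_cache)  # ... then the affine layer
    return dx, dw, db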

Loss layers: SVM & Softmax

SVM

def svm_loss(x, y):
    loss, dx = None, None

    num_train = x.shape[0]
    scores = x - np.max(x, axis=1, keepdims=True)                 # shift for numerical stability
    correct_class_scores = scores[np.arange(num_train), y]
    margins = np.maximum(0, scores - correct_class_scores[:, np.newaxis] + 1)
    margins[np.arange(num_train), y] = 0                          # the correct class does not contribute
    loss = np.sum(margins) / num_train

    num_pos = np.sum(margins > 0, axis=1)                         # number of classes that violate the margin
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(num_train), y] -= num_pos
    dx /= num_train

    return loss, dx

I put together a small 4x3 example (4 samples, 3 classes) to trace the intermediate values:

x:  
[[ 4.17943411e-04  1.39710028e-03 -1.78590431e-03]
 [-7.08827734e-04 -7.47253161e-05 -7.75016769e-04]
 [-1.49797903e-04  1.86172902e-03 -1.42552930e-03]
 [-3.76356699e-04 -3.42275390e-04  2.94907637e-04]]

y:  
[2 1 1 0]

scores:  
[[-0.00097916  0.         -0.003183  ]
 [-0.0006341   0.         -0.00070029]
 [-0.00201153  0.         -0.00328726]
 [-0.00067126 -0.00063718  0.        ]]

correct class scores:  
[-0.003183    0.          0.         -0.00067126]
margins (before zeroing the correct class):
[[1.00220385 1.003183   1.        ]
 [0.9993659  1.         0.99929971]
 [0.99798847 1.         0.99671274]
 [1.         1.00003408 1.00067126]]

margins (after zeroing the correct class):
[[1.00220385 1.003183   0.        ]
 [0.9993659  0.         0.99929971]
 [0.99798847 0.         0.99671274]
 [0.         1.00003408 1.00067126]]

num_pos: 
[2 2 2 2]
dx (indicator mask for positive margins):
[[1. 1. 0.]
 [1. 0. 1.]
 [1. 0. 1.]
 [0. 1. 1.]]

dx (after subtracting num_pos at the correct class, before dividing by num_train):
[[ 1.  1. -2.]
 [ 1. -2.  1.]
 [ 1. -2.  1.]
 [-2.  1.  1.]]

 

Softmax

For dx, the x.T.dot(dscores) step is dropped here: the loss layer only returns the gradient with respect to the scores, and the multiplication by x.T is handled by affine_backward.

def softmax_loss(x, y):

    loss, dx = None, None

    num_train = x.shape[0]
    scores = x - np.max(x, axis=1, keepdims=True)   # shift for numerical stability
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    loss = np.sum(-np.log(probs[np.arange(num_train), y])) / num_train

    # Gradient with respect to the scores
    dscores = probs
    dscores[np.arange(num_train), y] -= 1
    dscores /= num_train

    dx = dscores

    return loss, dx
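A quick hypothetical sanity check (not the notebook's official test): with near-zero random scores the predicted distribution is roughly uniform, so the loss should be close to log(C).

np.random.seed(0)
x = 0.001 * np.random.randn(4, 3)        # 4 samples, 3 classes, scores close to zero
y = np.array([2, 1, 1, 0])
loss, dx = softmax_loss(x, y)
print(loss, np.log(3))                   # loss should be close to log(3) ≈ 1.0986
print(dx.shape)                          # (4, 3), same shape as the scores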

Two-layer network

This part looks at the TwoLayerNet class in fc_net.py.

Assume the input dimension is D, the hidden dimension is H, and the number of classes is C.

The architecture is affine - relu - affine - softmax; the class does not perform gradient descent itself.
The model parameters are stored in the dictionary self.params.

from builtins import range
from builtins import object
import numpy as np

from ..layers import *
from ..layer_utils import *


class TwoLayerNet(object):
   
    def __init__(
        self,
        input_dim=3 * 32 * 32,
        hidden_dim=100,
        num_classes=10,
        weight_scale=1e-3,
        reg=0.0,
    ):
        
        self.params = {}
        self.reg = reg

        
        self.params['W1'] = weight_scale * np.random.randn(input_dim,hidden_dim)
        self.params['b1'] = np.zeros(hidden_dim)
        
        self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b2'] = np.zeros(num_classes)

     

    def loss(self, X, y=None):
     
        scores = None
        
        hidden_layer = np.maximum(0, np.dot(X, self.params['W1']) + self.params['b1'])  # ReLU activation
        scores = np.dot(hidden_layer, self.params['W2']) + self.params['b2']

  
        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
       
        num_train = X.shape[0]
        scores -= np.max(scores, axis=1, keepdims=True)  # for numerical stability
        softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
        correct_class_probs = softmax_scores[range(num_train), y]   # probability of the correct class for each sample
        data_loss = -np.log(correct_class_probs).mean()
        reg_loss = 0.5 * self.reg * (np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2))
        loss = data_loss + reg_loss

        # Backward pass
        dscores = softmax_scores.copy()
        dscores[range(num_train), y] -= 1
        dscores /= num_train

        grads['W2'] = np.dot(hidden_layer.T, dscores) + self.reg * self.params['W2']
        grads['b2'] = np.sum(dscores, axis=0)

        dhidden = np.dot(dscores, self.params['W2'].T)
        dhidden[hidden_layer <= 0] = 0  # backpropagate through ReLU

        grads['W1'] = np.dot(X.T, dhidden) + self.reg * self.params['W1']
        grads['b1'] = np.sum(dhidden, axis=0)

       
        return loss, grads
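A minimal usage sketch with made-up random data, just to show the two call modes of loss (test mode without labels, training mode with labels):

np.random.seed(0)
N, D, H, C = 5, 3 * 32 * 32, 100, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)

model = TwoLayerNet(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=1e-3, reg=0.1)

scores = model.loss(X)              # y is None -> test mode, only the scores come back
print(scores.shape)                 # (5, 10)

loss, grads = model.loss(X, y)      # training mode -> loss plus a gradient for every parameter
print(loss)
print(grads['W1'].shape, grads['b2'].shape)   # (3072, 100) (10,)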

 

 

 

 

 

 
