『cs231n』作业3问题1选讲_通过代码理解RNN&图像标注训练

RNN神经元理解

单个RNN神经元行为

括号中表示的是维度

向前传播

def rnn_step_forward(x, prev_h, Wx, Wh, b):
  """
  Advance a vanilla (tanh) RNN by a single timestep.

  Computes next_h = tanh(x . Wx + prev_h . Wh + b) for a minibatch of N
  examples, where inputs have dimension D and the hidden state has dimension H.

  Inputs:
  - x: Input data for this timestep, of shape (N, D).
  - prev_h: Hidden state from previous timestep, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)

  Returns a tuple of:
  - next_h: Next hidden state, of shape (N, H)
  - cache: Tuple (x, Wx, Wh, prev_h, next_h) needed for the backward pass.
  """
  # Contribution of the current input and of the previous hidden state.
  input_term = np.dot(x, Wx)
  recurrent_term = np.dot(prev_h, Wh)

  # The bias broadcasts over the minibatch dimension.
  pre_activation = input_term + recurrent_term + b
  next_h = np.tanh(pre_activation)

  # next_h is cached because the tanh derivative can be written in terms of
  # the tanh output itself.
  return next_h, (x, Wx, Wh, prev_h, next_h)

反向传播

def rnn_step_backward(dnext_h, cache):
  """
  Backward pass for a single timestep of a vanilla RNN.

  Inputs:
  - dnext_h: Gradient of loss with respect to next hidden state, of shape (N, H)
  - cache: Tuple (x, Wx, Wh, prev_h, next_h) saved by the forward step

  Returns a tuple of:
  - dx: Gradients of input data, of shape (N, D)
  - dprev_h: Gradients of previous hidden state, of shape (N, H)
  - dWx: Gradients of input-to-hidden weights, of shape (D, H)
  - dWh: Gradients of hidden-to-hidden weights, of shape (H, H)
  - db: Gradients of bias vector, of shape (H,)
  """
  x, Wx, Wh, prev_h, next_h = cache

  # Gradient w.r.t. the pre-activation a, using d tanh(a)/da = 1 - tanh(a)^2
  # expressed via the cached tanh output. Computed once and shared by every
  # gradient below.
  da = dnext_h * (1 - next_h**2)

  # a = x.Wx + prev_h.Wh + b, so each gradient is a single matrix product.
  dx = da.dot(Wx.T)
  dprev_h = da.dot(Wh.T)
  dWx = x.T.dot(da)
  dWh = prev_h.T.dot(da)
  # The bias is broadcast over the minibatch, so its gradient sums over it.
  db = np.sum(da, axis=0)
  return dx, dprev_h, dWx, dWh, db

单层RNN神经元行为

x（N，T，D）表示含N个样本的batch中，每个样本有T个字符向量，每个向量D维。

RNN输出有两个方向，一个向上一层（输出层），一个向同层下一个时序，所以反向传播时两个梯度需要相加，输出层梯度可以直接求出（或是上一层中递归求出），所以使用dh(N,T,H)保存好，而同层时序梯度必须在同层中递归计算。

正向传播

def rnn_forward(x, h0, Wx, Wh, b):
  """
  Unroll a vanilla RNN over a whole input sequence.

  Each of the N sequences in the minibatch consists of T input vectors of
  dimension D. Starting from h0, rnn_step_forward is applied once per
  timestep, and the hidden state (of size H) produced at every step is kept.

  Inputs:
  - x: Input data for the entire timeseries, of shape (N, T, D).
  - h0: Initial hidden state, of shape (N, H)
  - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
  - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
  - b: Biases of shape (H,)

  Returns a tuple of:
  - h: Hidden states for the entire timeseries, of shape (N, T, H).
  - cache: List with one rnn_step_forward cache per timestep, in time order
  """
  N, T, _ = x.shape
  _, H = h0.shape

  h = np.zeros((N, T, H))
  cache = []

  # The hidden state fed into step t is the one produced at step t - 1
  # (h0 for the very first step).
  prev_h = h0
  for t in range(T):
    h[:, t, :], step_cache = rnn_step_forward(x[:, t, :], prev_h, Wx, Wh, b)
    cache.append(step_cache)
    prev_h = h[:, t, :]

  return h, cache

单层RNN反向传播

def rnn_backward(dh, cache):
  """
  Compute the backward pass for a vanilla RNN over an entire sequence of data.

  Inputs:
  - dh: Upstream gradients of all hidden states, of shape (N, T, H)
  - cache: List of T per-timestep caches in time order, each a tuple
    (x, Wx, Wh, prev_h, next_h). The list is only read, never modified, so the
    same cache can be reused after this call.

  Returns a tuple of:
  - dx: Gradient of inputs, of shape (N, T, D)
  - dh0: Gradient of initial hidden state, of shape (N, H)
  - dWx: Gradient of input-to-hidden weights, of shape (D, H)
  - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
  - db: Gradient of biases, of shape (H,)
  """
  # The input dimension D is not recoverable from dh, so read it off a cached
  # per-timestep input.
  x, Wx, Wh, prev_h, next_h = cache[-1]
  _, D = x.shape
  N, T, H = dh.shape

  dx = np.zeros((N, T, D))
  dWx = np.zeros((D, H))
  dWh = np.zeros((H, H))
  db = np.zeros(H)

  # Gradient arriving from the following timestep; zero after the last step.
  dprev_h = np.zeros((N, H))
  for t in range(T - 1, -1, -1):
    # Each hidden state receives gradient from two places: the upstream loss
    # (dh[:, t, :]) and the next timestep (dprev_h).
    # cache[t] is indexed rather than popped so the caller's list stays intact.
    dx_t, dprev_h, dWx_t, dWh_t, db_t = rnn_step_backward(
        dh[:, t, :] + dprev_h, cache[t])
    dx[:, t, :] = dx_t
    # Weights are shared across timesteps, so their gradients accumulate.
    dWx += dWx_t
    dWh += dWh_t
    db += db_t

  # What flows out of the first timestep is the gradient of the initial state.
  dh0 = dprev_h
  return dx, dh0, dWx, dWh, db

图像标注过程理解

正向传播流程如下，

几个有意思的点

字符和向量的映射

涉及两个映射，

一个是caption_in到输出节点维度向量的映射，映射矩阵是需要学习的参数
一个是输出节点向量到字符的映射，这里面有专门的映射函数，输出节点本身是变化的（被学习的）

第一个映射：

caption_in和caption_out是输入和标准（caption_in=caption[:-1],caption_out=caption[1:]），不考虑batch的话是一维数组，通过We矩阵可以映射到字符向量空间，转换以及反向传播过程如下，

def word_embedding_forward(x, W):
  """
  Look up the embedding vector of every word index in a minibatch.

  The minibatch holds N sequences of length T. The vocabulary has V words and
  each word is represented by a vector of dimension D.

  Inputs:
  - x: Integer array of shape (N, T) giving indices of words. Each element idx
    of x must be in the range 0 <= idx < V.
  - W: Weight matrix of shape (V, D) giving word vectors for all words.

  Returns a tuple of:
  - out: Array of shape (N, T, D) giving word vectors for all input words.
  - cache: Tuple (W, x) needed for the backward pass
  """
  # Indexing the rows of W with the integer array x selects one row per word
  # index, giving an output of shape x.shape + (D,).
  return W[x, :], (W, x)

反向传播注意，这不是个标准意义上的链式传播的门，按照逻辑分析这个映射过程的梯度是叠加的（同一个词的索引在x中出现多次时，对应的梯度要累加到W的同一行上），注意函数np.add.at()（即ufunc.at）的用法

def word_embedding_backward(dout, cache):
  """
  Backward pass for word embeddings.

  The word indices are integers, so no gradient flows into them; only the
  gradient with respect to the embedding matrix is returned.

  Inputs:
  - dout: Upstream gradients of shape (N, T, D)
  - cache: Tuple (W, x) from the forward pass

  Returns:
  - dW: Gradient of word embedding matrix, of shape (V, D).
  """
  embeddings, word_idx = cache
  grad = np.zeros_like(embeddings)

  # A word may appear several times in word_idx, and every occurrence must
  # contribute to the same row. np.add.at performs this unbuffered
  # accumulation; a plain `grad[word_idx] += dout` would apply only one update
  # per repeated index.
  np.add.at(grad, word_idx, dout)
  return grad

第二个映射：

正常的多维y=xW计算，

y = x.reshape(x.shape[0], -1).dot(w) + b # 保留N，后面的数据化为一维

这里的y=xW计算，

y = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b # 其实使用y = x.dot(w) + b 效果是一样的，自动广播到最低维度

上面问题不大，问题在求梯度的时候，两者处理有一定差别，注意到这一点的话只要在演算的时候校对好各个维度的值就好了（保证相乘的两项维度可以相乘，而且理论结果维度和算式相符）

情况1（正常的多维y=xW计算）的代码，

def affine_forward(x, w, b):
  """
  Computes the forward pass for an affine (fully-connected) layer.

  The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
  Each input is flattened into a vector of length D = d_1 * ... * d_k and
  multiplied against a weight matrix of shape (D, M).

  Inputs:
  - x: Input data, of shape (N, d_1, ..., d_k)
  - w: Weights, of shape (D, M)
  - b: Biases, of shape (M,)

  Returns a tuple of:
  - out: output, of shape (N, M)
  - cache: (x, w, b)
  """
  # Flatten to (N, D) using the D given by w rather than -1: with an empty
  # minibatch (N == 0) NumPy cannot infer a -1 dimension and raises ValueError.
  x_rows = x.reshape(x.shape[0], w.shape[0])
  out = x_rows.dot(w) + b
  cache = (x, w, b)
  return out, cache


def affine_backward(dout, cache):
  """
  Computes the backward pass for an affine layer.

  Inputs:
  - dout: Upstream derivative, of shape (N, M)
  - cache: Tuple of:
    - x: Input data, of shape (N, d_1, ... d_k)
    - w: Weights, of shape (D, M)
    - b: Biases, of shape (M,) (not needed for the gradients)

  Returns a tuple of:
  - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
  - dw: Gradient with respect to w, of shape (D, M)
  - db: Gradient with respect to b, of shape (M,)
  """
  x, w, _ = cache

  # out = x_rows.w + b with x_rows of shape (N, D); the gradient w.r.t. x is
  # reshaped back to the original input shape.
  dx = dout.dot(w.T).reshape(x.shape)

  # Flatten with the D given by w rather than -1 so that an empty minibatch
  # (N == 0) does not make the reshape fail.
  x_rows = x.reshape(x.shape[0], w.shape[0])
  dw = x_rows.T.dot(dout)

  # The bias is broadcast over the minibatch, so its gradient sums over it.
  db = np.sum(dout, axis=0)
  return dx, dw, db