Neural Networks and Deep Learning (Qiu Xipeng), Programming Exercise 4: FNN, a simple neural network in numpy (Jupyter export)
GitHub - nndl/nndl-exercise-ans: Solutions for nndl/exercise
In the reference answer, the definition of x needs to be moved earlier; otherwise Python reports that x is undefined.
This is probably due to differences in the Python/TensorFlow versions.
The original answer presumably passed its tests at the time; two years on, a few small issues need minor adjustments.
Prepare the data
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, datasets
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # or any {'0', '1', '2'}
def mnist_dataset():
(x, y), (x_test, y_test) = datasets.mnist.load_data()
#normalize
x = x/255.0
x_test = x_test/255.0
return (x, y), (x_test, y_test)
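As a quick sanity check (a minimal sketch; the variable names are only for illustration), the loader returns 60,000/10,000 images of shape 28×28 scaled into [0, 1]:
(x_train, y_train), (x_te, y_te) = mnist_dataset()
print(x_train.shape, y_train.shape)  # (60000, 28, 28) (60000,)
print(x_te.shape, y_te.shape)        # (10000, 28, 28) (10000,)
print(x_train.min(), x_train.max())  # 0.0 1.0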
Demo: numpy-based auto differentiation
import numpy as np
class Matmul:
def __init__(self):
self.mem = {}
def forward(self, x, W):
h = np.matmul(x, W)
self.mem={'x': x, 'W':W}
return h
def backward(self, grad_y):
'''
x: shape(N, d)
w: shape(d, d')
grad_y: shape(N, d')
'''
x = self.mem['x']
W = self.mem['W']
####################
        '''Gradients of the matrix product with respect to x and W'''
        grad_x = np.matmul(grad_y, W.T) # shape(N, d)
grad_W = np.matmul(x.T, grad_y)
####################
return grad_x, grad_W
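# Sanity check (sketch, not from the original answer): compare Matmul.backward
# against a central finite-difference approximation of d/dW sum(x @ W).
# Sizes and the tolerance below are arbitrary choices.
x_chk = np.random.normal(size=[3, 4])
W_chk = np.random.normal(size=[4, 2])
mm_chk = Matmul()
out_chk = mm_chk.forward(x_chk, W_chk)
_, grad_W_chk = mm_chk.backward(np.ones_like(out_chk))  # analytic gradient of sum(x @ W)
eps_chk = 1e-6
num_grad_W = np.zeros_like(W_chk)
for i in range(W_chk.shape[0]):
    for j in range(W_chk.shape[1]):
        W_plus, W_minus = W_chk.copy(), W_chk.copy()
        W_plus[i, j] += eps_chk
        W_minus[i, j] -= eps_chk
        num_grad_W[i, j] = (np.matmul(x_chk, W_plus).sum() - np.matmul(x_chk, W_minus).sum()) / (2 * eps_chk)
print(np.allclose(grad_W_chk, num_grad_W, atol=1e-5))  # expect True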
class Relu:
def __init__(self):
self.mem = {}
def forward(self, x):
self.mem['x']=x
return np.where(x > 0, x, np.zeros_like(x))
def backward(self, grad_y):
'''
grad_y: same shape as x
'''
####################
        '''Gradient of the ReLU activation'''
x = self.mem['x']
grad_x = (x > 0).astype(np.float32) * grad_y
####################
return grad_x
class Softmax:
'''
    softmax over the last dimension
'''
def __init__(self):
self.epsilon = 1e-12
self.mem = {}
def forward(self, x):
'''
x: shape(N, c)
'''
x_exp = np.exp(x)
partition = np.sum(x_exp, axis=1, keepdims=True)
out = x_exp/(partition+self.epsilon)
self.mem['out'] = out
self.mem['x_exp'] = x_exp
return out
def backward(self, grad_y):
'''
grad_y: same shape as x
'''
s = self.mem['out']
sisj = np.matmul(np.expand_dims(s,axis=2), np.expand_dims(s, axis=1)) # (N, c, c)
g_y_exp = np.expand_dims(grad_y, axis=1)
tmp = np.matmul(g_y_exp, sisj) #(N, 1, c)
tmp = np.squeeze(tmp, axis=1)
tmp = -tmp+grad_y*s
return tmp
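# Note (sketch, not part of the original answer): np.exp(x) can overflow for large
# logits, which is one reason the epsilon above is needed. Subtracting the row-wise
# max first is numerically stable and gives the same probabilities, since
# softmax(x) == softmax(x - c) for any per-row constant c. The shift could be
# dropped into Softmax.forward without touching backward, which only reads mem['out'].
def stable_softmax(x):
    x_shift = x - np.max(x, axis=1, keepdims=True)
    x_exp = np.exp(x_shift)
    return x_exp / np.sum(x_exp, axis=1, keepdims=True)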
class Log:
'''
    element-wise natural log
'''
def __init__(self):
self.epsilon = 1e-12
self.mem = {}
def forward(self, x):
'''
x: shape(N, c)
'''
out = np.log(x+self.epsilon)
self.mem['x'] = x
return out
def backward(self, grad_y):
'''
grad_y: same shape as x
'''
x = self.mem['x']
        return 1. / (x + self.epsilon) * grad_y
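With one-hot labels, the Softmax → Log chain seeded with -label (as the model below does) is exactly the cross-entropy loss, and its gradient with respect to the logits simplifies to softmax(x) - label. A small check of that identity (a sketch; the sizes and names are illustrative):
# Cross-entropy shortcut check (sketch): for one-hot labels,
# d/dx [ -sum(label * log softmax(x)) ] == softmax(x) - label.
logits = np.random.normal(size=[4, 6])
onehot = np.zeros_like(logits)
onehot[np.arange(4), [1, 0, 3, 5]] = 1.
sm_chk, log_chk = Softmax(), Log()
prob_chk = sm_chk.forward(logits)
log_chk.forward(prob_chk)
grad_logits = sm_chk.backward(log_chk.backward(-onehot))  # same chaining as the model's backward below
print(np.allclose(grad_logits, prob_chk - onehot, atol=1e-6))  # expect True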
Gradient check
# import tensorflow as tf
# x = np.random.normal(size=[5, 6])
# W = np.random.normal(size=[6, 4])
# aa = Matmul()
# out = aa.forward(x, W) # shape(5, 4)
# grad = aa.backward(np.ones_like(out))
# print (grad)
# with tf.GradientTape() as tape:
# x, W = tf.constant(x), tf.constant(W)
# tape.watch(x)
# y = tf.matmul(x, W)
# loss = tf.reduce_sum(y)
# grads = tape.gradient(loss, x)
# print (grads)
# import tensorflow as tf
# x = np.random.normal(size=[5, 6])
# aa = Relu()
# out = aa.forward(x) # shape(5, 6)
# grad = aa.backward(np.ones_like(out))
# print (grad)
# with tf.GradientTape() as tape:
# x= tf.constant(x)
# tape.watch(x)
# y = tf.nn.relu(x)
# loss = tf.reduce_sum(y)
# grads = tape.gradient(loss, x)
# print (grads)
# import tensorflow as tf
# x = np.random.normal(size=[5, 6], scale=5.0, loc=1)
# label = np.zeros_like(x)
# label[0, 1]=1.
# label[1, 0]=1
# label[1, 1]=1
# label[2, 3]=1
# label[3, 5]=1
# label[4, 0]=1
# print(label)
# aa = Softmax()
# out = aa.forward(x) # shape(5, 6)
# grad = aa.backward(label)
# print (grad)
# with tf.GradientTape() as tape:
# x= tf.constant(x)
# tape.watch(x)
# y = tf.nn.softmax(x)
# loss = tf.reduce_sum(y*label)
# grads = tape.gradient(loss, x)
# print (grads)
# import tensorflow as tf
# x = np.random.normal(size=[5, 6])
# aa = Log()
# out = aa.forward(x) # shape(5, 6)
# grad = aa.backward(label)
# print (grad)
# with tf.GradientTape() as tape:
# x= tf.constant(x)
# tape.watch(x)
# y = tf.math.log(x)
# loss = tf.reduce_sum(y*label)
# grads = tape.gradient(loss, x)
# print (grads)
Final Gradient Check
import tensorflow as tf
x = np.random.normal(size=[5, 6])
W1 = np.random.normal(size=[6, 5])
W2 = np.random.normal(size=[5, 6])
label = np.zeros_like(x)
label[0, 1]=1.
label[1, 0]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1
mul_h1 = Matmul()
mul_h2 = Matmul()
relu = Relu()
softmax = Softmax()
log = Log()
h1 = mul_h1.forward(x, W1) # shape(5, 5)
h1_relu = relu.forward(h1)
h2 = mul_h2.forward(h1_relu, W2)
h2_soft = softmax.forward(h2)
h2_log = log.forward(h2_soft)
h2_log_grad = log.backward(label)
h2_soft_grad = softmax.backward(h2_log_grad)
h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)
h1_relu_grad = relu.backward(h2_grad)
h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)
print(h2_log_grad)
print('--'*20)
# print(W2_grad)
with tf.GradientTape() as tape:
x, W1, W2, label = tf.constant(x), tf.constant(W1), tf.constant(W2), tf.constant(label)
tape.watch(W1)
tape.watch(W2)
h1 = tf.matmul(x, W1)
h1_relu = tf.nn.relu(h1)
h2 = tf.matmul(h1_relu, W2)
prob = tf.nn.softmax(h2)
log_prob = tf.math.log(prob)
loss = tf.reduce_sum(label * log_prob)
grads = tape.gradient(loss, [prob])
print (grads[0].numpy())
[[0.00000000e+00 5.93789528e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[6.65507591e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.65233390e+04
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 7.02646772e+02]
[2.88022415e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]]
----------------------------------------
[[0.00000000e+00 5.93789528e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[6.65507591e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.65233393e+04
0.00000000e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 7.02646772e+02]
[2.88022415e+01 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]]
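Rather than comparing the two printouts by eye, the agreement can be checked directly (a sketch; it reuses h2_log_grad and grads from the cell above, with a relative tolerance because the epsilon terms introduce tiny differences):
print(np.allclose(h2_log_grad, grads[0].numpy(), rtol=1e-5))  # expect True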
Build the model
class myModel:
def __init__(self):
self.W1 = np.random.normal(size=[28*28+1, 100])
self.W2 = np.random.normal(size=[100, 10])
self.mul_h1 = Matmul()
self.mul_h2 = Matmul()
self.relu = Relu()
self.softmax = Softmax()
self.log = Log()
def forward(self, x):
x = x.reshape(-1, 28*28)
bias = np.ones(shape=[x.shape[0], 1])
x = np.concatenate([x, bias], axis=1)
        self.h1 = self.mul_h1.forward(x, self.W1) # shape(N, 100)
self.h1_relu = self.relu.forward(self.h1)
self.h2 = self.mul_h2.forward(self.h1_relu, self.W2)
self.h2_soft = self.softmax.forward(self.h2)
self.h2_log = self.log.forward(self.h2_soft)
def backward(self, label):
        self.h2_log_grad = self.log.backward(-label) # loss = -sum(label*log prob), so the backward pass is seeded with -label
self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)
self.h2_grad, self.W2_grad = self.mul_h2.backward(self.h2_soft_grad)
self.h1_relu_grad = self.relu.backward(self.h2_grad)
self.h1_grad, self.W1_grad = self.mul_h1.backward(self.h1_relu_grad)
model = myModel()
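Before training, a forward pass on a dummy batch (a sketch; batch size 5 and the uniform fake pixels are arbitrary) confirms that the bias column and weight shapes line up:
dummy = np.random.uniform(size=[5, 28, 28])
model.forward(dummy)  # does not modify the weights
print(model.h1.shape)      # (5, 100): [5, 785] @ [785, 100]
print(model.h2_log.shape)  # (5, 10)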
Compute the loss
def compute_loss(log_prob, labels):
return np.mean(np.sum(-log_prob*labels, axis=1))
def compute_accuracy(log_prob, labels):
predictions = np.argmax(log_prob, axis=1)
truth = np.argmax(labels, axis=1)
return np.mean(predictions==truth)
def train_one_step(model, x, y):
model.forward(x)
model.backward(y)
model.W1 -= 1e-5* model.W1_grad
model.W2 -= 1e-5* model.W2_grad
loss = compute_loss(model.h2_log, y)
accuracy = compute_accuracy(model.h2_log, y)
return loss, accuracy
def test(model, x, y):
model.forward(x)
loss = compute_loss(model.h2_log, y)
accuracy = compute_accuracy(model.h2_log, y)
return loss, accuracy
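A tiny worked example with made-up probabilities (a sketch) shows what compute_loss and compute_accuracy measure:
# Two samples, three classes (sketch).
toy_log_prob = np.log(np.array([[0.7, 0.2, 0.1],
                                [0.1, 0.3, 0.6]]))
toy_labels = np.array([[1., 0., 0.],
                       [0., 1., 0.]])
print(compute_loss(toy_log_prob, toy_labels))      # mean of -log(0.7) and -log(0.3), about 0.78
print(compute_accuracy(toy_log_prob, toy_labels))  # 0.5: first sample correct, second predicted as class 2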
Train the model
train_data, test_data = mnist_dataset()
train_label = np.zeros(shape=[train_data[0].shape[0], 10])
test_label = np.zeros(shape=[test_data[0].shape[0], 10])
train_label[np.arange(train_data[0].shape[0]), np.array(train_data[1])] = 1.
test_label[np.arange(test_data[0].shape[0]), np.array(test_data[1])] = 1.
for epoch in range(50):
loss, accuracy = train_one_step(model, train_data[0], train_label)
print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)
loss, accuracy = test(model, test_data[0], test_label)
print('test loss', loss, '; accuracy', accuracy)
epoch 0 : loss 24.120447550872193 ; accuracy 0.10506666666666667
epoch 1 : loss 23.556807793137484 ; accuracy 0.12285
epoch 2 : loss 23.048235414054865 ; accuracy 0.14381666666666668
epoch 3 : loss 22.681964782819378 ; accuracy 0.15748333333333334
epoch 4 : loss 22.31934050264306 ; accuracy 0.16823333333333335
epoch 5 : loss 21.816533326873117 ; accuracy 0.17876666666666666
epoch 6 : loss 21.290418972328762 ; accuracy 0.1923
epoch 7 : loss 20.864675481620193 ; accuracy 0.21083333333333334
epoch 8 : loss 20.40118101318322 ; accuracy 0.22231666666666666
epoch 9 : loss 19.688677507169867 ; accuracy 0.24261666666666667
epoch 10 : loss 18.250652141064315 ; accuracy 0.28105
epoch 11 : loss 17.413139190206394 ; accuracy 0.3168166666666667
epoch 12 : loss 15.905288691015508 ; accuracy 0.3589
epoch 13 : loss 14.97563324371492 ; accuracy 0.4026
epoch 14 : loss 13.90615214978969 ; accuracy 0.4339166666666667
epoch 15 : loss 12.928732933159907 ; accuracy 0.4564666666666667
epoch 16 : loss 12.316841190099938 ; accuracy 0.4794
epoch 17 : loss 11.738809927940636 ; accuracy 0.5045
epoch 18 : loss 10.766245553596033 ; accuracy 0.54675
epoch 19 : loss 10.241306013662607 ; accuracy 0.5598666666666666
epoch 20 : loss 10.17684778741172 ; accuracy 0.57345
epoch 21 : loss 9.69722437744703 ; accuracy 0.5863
epoch 22 : loss 9.583569462976824 ; accuracy 0.5949833333333333
epoch 23 : loss 9.412803692259162 ; accuracy 0.5995
epoch 24 : loss 9.32294046081438 ; accuracy 0.6035833333333334
epoch 25 : loss 9.27861330368951 ; accuracy 0.6066666666666667
epoch 26 : loss 8.635697269031605 ; accuracy 0.6255666666666667
epoch 27 : loss 8.270694241139738 ; accuracy 0.6350166666666667
epoch 28 : loss 7.981981911444959 ; accuracy 0.6392666666666666
epoch 29 : loss 7.7815894381357475 ; accuracy 0.6464666666666666
epoch 30 : loss 7.319034580141721 ; accuracy 0.6605166666666666
epoch 31 : loss 6.990317954569284 ; accuracy 0.673
epoch 32 : loss 6.676015560368183 ; accuracy 0.6792166666666667
epoch 33 : loss 6.338457483699933 ; accuracy 0.6994166666666667
epoch 34 : loss 6.18231834020112 ; accuracy 0.6971666666666667
epoch 35 : loss 5.8601951694351895 ; accuracy 0.721
epoch 36 : loss 5.5723800515554 ; accuracy 0.72185
epoch 37 : loss 5.345872925131976 ; accuracy 0.7413166666666666
epoch 38 : loss 5.1037465470389805 ; accuracy 0.74295
epoch 39 : loss 4.998503318083353 ; accuracy 0.7547666666666667
epoch 40 : loss 4.7739352219431614 ; accuracy 0.7584333333333333
epoch 41 : loss 4.684302398248045 ; accuracy 0.7667
epoch 42 : loss 4.528539225806665 ; accuracy 0.7697833333333334
epoch 43 : loss 4.430669532636246 ; accuracy 0.7773
epoch 44 : loss 4.299548004077121 ; accuracy 0.7803333333333333
epoch 45 : loss 4.1827985220379515 ; accuracy 0.7884333333333333
epoch 46 : loss 4.066019079244932 ; accuracy 0.7913333333333333
epoch 47 : loss 3.9536817935945083 ; accuracy 0.7982333333333334
epoch 48 : loss 3.867862412250489 ; accuracy 0.7999333333333334
epoch 49 : loss 3.779043032271536 ; accuracy 0.80545
test loss 3.4802052462670967 ; accuracy 0.819
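The loop above is full-batch gradient descent: every epoch runs one forward/backward pass over all 60,000 training images. A mini-batch variant is a common alternative (a sketch; batch_size and the epoch count are assumptions, and the fixed 1e-5 step size inside train_one_step may need retuning for smaller batches):
# Mini-batch training sketch; reuses train_one_step and the data prepared above.
batch_size = 128
num_train = train_data[0].shape[0]
for epoch in range(5):
    perm = np.random.permutation(num_train)
    for b in range(num_train // batch_size):
        idx = perm[b * batch_size:(b + 1) * batch_size]
        loss, accuracy = train_one_step(model, train_data[0][idx], train_label[idx])
    print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)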