An Excel-Based Neural Network Toolbox (Part 2): VBA Implementation of ANN Forward Propagation (FP) and Backpropagation (BP)
Links to the other parts of this series:
Neural Network Data Structures
In the previous article I presented the complete code for the neural network data structures in the ANN Toolbox, including the structure definitions and the routines for creating, initializing, manipulating, and deleting units, layers, and networks, together with the unit-calculation algorithm. See the previous article for details.
The Forward Propagation Algorithm
Forward propagation is the ANN's "perception" process. Briefly, the first layer of the ANN is fully connected to our input data, and whether each unit activates depends on the weighted sum of its inputs and connection weights, plus the bias value b. The formula is:
$$Out = A\left(\sum_i^N x_i \cdot w_i + b\right)$$
where $A(x)$ is the unit's activation function.
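For reference, the activation functions implemented in the toolbox (they appear in UTFunction() further below) are, in mathematical form:

$$\sigma(x) = \frac{1}{1+e^{-x}}, \qquad \tanh(x) = \frac{1-e^{-2x}}{1+e^{-2x}}, \qquad \mathrm{ReLU}(x) = \max(0, x), \qquad \mathrm{LeakyReLU}(x) = \max(0, x) + 0.01\min(0, x)$$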
In the ANN Toolbox, the code is as follows:
Private Function calcUnit(unit As NNUnit, vec() As Double, f As Byte, Optional d As Byte = 1) As Double
' calculates the output of a unit, returns single double result, this function is used for FULL-CONNECTION layer
' f as predefined type of unit activation function
' d as dimension of unit weights array
' calculation is feasible only when dimension and size of vec is the same as those of unit weights, _
' but it is not verified before calculation
Dim wc As Long, vc As Long, dimension As Byte
Dim i As Long, j As Long, k As Long
Dim m As Long, n As Long, o As Long
Dim result As Double
On Error GoTo errorHandler
With unit
result = 0
Select Case d
Case 1
For i = 1 To UBound(.w, 1)
result = result + .w(i) * vec(i)
Next
Case 2
For i = 1 To UBound(.w, 1)
For j = 1 To UBound(.w, 2)
result = result + .w(i, j) * vec(i, j)
Next
Next
Case 3
For i = 1 To UBound(.w, 1)
For j = 1 To UBound(.w, 2)
For k = 1 To UBound(.w, 3)
result = result + .w(i, j, k) * vec(i, j, k)
Next
Next
Next
End Select
result = result + .b
result = UTFunction(f, result)
End With
calcUnit = result
Exit Function
errorHandler:
calcUnit = 0
MsgBox "Calculating Unit Error!"
End Function
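As a quick worked instance of what calcUnit() computes (with made-up numbers, not values from the toolbox): for a unit with weights $w = (0.5, -0.2)$, bias $b = 0.1$ and input $x = (1, 2)$, the weighted sum is $0.5 \cdot 1 + (-0.2) \cdot 2 + 0.1 = 0.2$, and with the logistic activation the output is $\sigma(0.2) = 1/(1 + e^{-0.2}) \approx 0.55$.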
Computing the Output of a Single-Layer Perceptron
The code above computes the value of $\sum_i^N x_i \cdot w_i + b$ and passes the result to UTFunction(), the unit's activation function, which computes the final activation value:
Private Function UTFunction(fType As Byte, X As Double) As Double
' this function calculates the Unit Transfering Function
' to prevent overflow errors, |x| must stay below 700 (Exp overflows beyond that range)
If X > -700 And X < 700 Then
Select Case fType
Case UTFLOG
UTFunction = 1 / (1 + Exp(-1 * X))
Case UTFTanH
'UTFunction = Tanh(x)
UTFunction = (1 - Exp(-2 * X)) / (1 + Exp(-2 * X)) ' tanh(x) = (1 - e^-2x) / (1 + e^-2x)
Case UTFReL
UTFunction = max(0, X)
Case UTFLkyReL
UTFunction = max(0, X) + 0.01 * min(0, X)
Case UTFLin
UTFunction = X
Case UTFSoftMax
' softmax is calculated later when results of all units are calculated, here only gives an intermediate result
UTFunction = Exp(X)
End Select
ElseIf X < -700 Then 'if x<-700 exp function will return "overflow" error, then value should be calculated manually
Select Case fType
Case UTFLOG
UTFunction = 0
Case UTFTanH
'UTFunction = Tanh(x)
UTFunction = -1
Case UTFReL
UTFunction = 0
Case UTFLkyReL
UTFunction = 0.01 * X
Case UTFLin
UTFunction = X
Case UTFSoftMax
UTFunction = 0
End Select
ElseIf X > 700 Then
Select Case fType
Case UTFLOG
UTFunction = 1
Case UTFTanH
'UTFunction = Tanh(x)
UTFunction = 1
Case UTFReL
UTFunction = X
Case UTFLkyReL
UTFunction = X
Case UTFLin
UTFunction = X
Case UTFSoftMax
UTFunction = 1E+200
End Select
End If
End Function
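A minimal sanity check of UTFunction() could look like the sketch below; it assumes it is placed in the same module (the function is Private) and that the UTF* constants are defined as in part one of this series.

Private Sub TestUTFunction()
    ' spot-check a few activation values in the Immediate window
    Debug.Print UTFunction(UTFLOG, 0)   ' sigmoid(0) -> 0.5
    Debug.Print UTFunction(UTFReL, -3)  ' ReLU(-3)   -> 0
    Debug.Print UTFunction(UTFLin, 2.5) ' linear     -> 2.5
End Sub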
Forward Propagation of the Perceptron Outputs
After the first layer of perceptrons receives the input signal, it applies the activation function to compute the first layer's outputs and passes them to the next layer. The next layer applies exactly the same rule to compute its own activations, and so on, layer by layer, all the way to the final output layer:
In the ANN Toolbox this process is implemented by the NNFP() function:
Public Function NNFP(net As NN, vec() As Double, output() As MultiArr) As Long
' get forward calculation result - calculates in one pass the output of a NN given an input vector, and returns the output vectors of all layers
' vec() is the input vector, function returns the classification of final output if size of output is larger than 1
Dim vecSize As Long, outputSize As Long
Dim curUnit As NNUnit, utf As Byte, num As Double
Dim i As Long, j As Long, n As Long, m As Long
vecSize = UBound(vec)
With net
'''' set up the structure of multi array output()
n = .hlayercount + 1
ReDim output(0 To n)
For i = 0 To n
'' gets the number of units for each layer
Select Case i
Case 0
m = vecSize ' input is stored in output(0)
Case n
m = UBound(.OutLayer.units)
Case Else
m = UBound(.hLayer(i).units)
End Select
ReDim output(i).d(1 To m)
Next
'' start calculation of neural network outputs
If vecSize = .inSize Then
output(0).d = vec
For i = 1 To n 'calculate output layer by layer, including last layer = outlayer, and store result of each layer in the multi array
If i < n Then
outputSize = UBound(.hLayer(i).units) 'output size = either the unit count of a hidden layer, or the NNUnit count of output layer
utf = .hLayer(i).utf 'get the unit transfering function for the layer
Else
outputSize = UBound(.OutLayer.units)
utf = .OutLayer.utf
End If
For j = 1 To outputSize
If i <= .hlayercount Then
curUnit = .hLayer(i).units(j)
Else
curUnit = .OutLayer.units(j)
End If
output(i).d(j) = calcUnit(curUnit, output(i - 1).d, utf)
Next
Next
End If
' find the largest value in the output vector, and return the class represented by the largest; _
' calculate output value of outlayer if outlayer is softmax
j = 1
If outputSize > 1 Then
If .OutLayer.utf = UTFSoftMax Then
num = 0
For i = 1 To outputSize
num = num + output(n).d(i)
If output(n).d(j) < output(n).d(i) Then j = i
Next
' calculate the value of softmax, overflow if num = 0
If num <> 0 Then
For i = 1 To outputSize
output(n).d(i) = output(n).d(i) / num
Next
End If
Else
For i = 2 To outputSize
If output(n).d(j) < output(n).d(i) Then j = i
Next
End If
End If
NNFP = j
End With
End Function
This function only applies to fully connected networks; the convolution in a CNN requires different logic, so the CNNFP algorithm differs slightly. The idea is the same, though: starting from the input layer, compute the activations of all neurons from front to back, layer by layer, and pass each layer's output on to the next. For a CNN, what differs is the dimensionality of the data.
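As a hedged usage sketch (net and inputVec stand for a network and an input vector built with the part-one routines), one forward pass looks like this:

Sub ForwardPassExample(net As NN, inputVec() As Double)
    ' run one forward pass; outputs(0) holds the input, outputs(hlayercount + 1) the final layer
    Dim outputs() As MultiArr, cls As Long
    cls = NNFP(net, inputVec, outputs)
    Debug.Print "predicted class: " & cls
    Debug.Print "first output activation: " & outputs(net.hlayercount + 1).d(1)
End Sub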
Implementing the Backpropagation Algorithm
Once a neural network has completed one forward pass, it has produced its output. If the network is already trained, its weights are highly adapted to the relevant input signals: specific inputs produce specific activations in specific neurons and ultimately a "meaningful" output at the output layer. In that case no backpropagation algorithm is needed.
Backpropagation is used to train the neurons. An untrained network produces meaningless random output. The training method used here is so-called "supervised learning": when we feed the network a particular input, we let it produce its (initially random) output, but at the same time we tell it directly what the "correct" output is. From the difference between the actual output and the "correct" output (which we call the error $\delta$), we can derive, for every neuron in every layer, how far its weights are from the values that would produce the correct output. This error is derived backwards from the last layer (the output layer), one level at a time: first compute the output error, then derive from it the error of the second-to-last layer, then of the third-to-last layer, and so on, until the error of the whole network is obtained.
Once the error of the whole network is known, we adjust each neuron's weights "slightly" in the direction indicated by that error, and then run the next round of "learning".
This is the learning process of a neural network. Because the error propagates backwards along the path the input data travelled, the method is named the backpropagation algorithm.
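In equation form (matching the code below, with learning rate $\eta$, i.e. the Ate parameter, and activation derivative $A'$):

$$\delta_i^{(out)} = (t_i - o_i)\,A'(o_i), \qquad \delta_j = A'(o_j)\sum_k \delta_k\, w_{kj}, \qquad \Delta w_{ji} = \eta\,\delta_j\,x_i$$

where $t$ is the target, $o$ the actual output, and the sum over $k$ runs over the units of the layer that follows unit $j$.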
Computing the Error and the Gradient
Error and gradient computation in the ANN Toolbox relies heavily on the NNGrad() function, which first computes the difference between the network's output and the target value.
Once that difference is known, we still need the partial derivative of each output unit, which requires the derivative of the activation function. That derivative is defined in dUTFunction():
Private Function dUTFunction(fType As Byte, X As Double) As Double
' this function calculates the derived function of the unit transfering function
' for logistic sigmoid and hyperbolic tangent functions the input X = f(x)
Select Case fType
Case UTFLOG
dUTFunction = X * (1 - X)
Case UTFTanH
dUTFunction = (1 + X) * (1 - X)
Case UTFReL ' according to Excel manual calc, d-function of ReLU is 1
'If X > 0 Then
dUTFunction = 1
'Else
' dUTFunction = 0
'End If
Case UTFLkyReL
If X > 0 Then
dUTFunction = 1
Else
dUTFunction = 0.01
End If
Case UTFLin
dUTFunction = 1
Case UTFSoftMax
dUTFunction = 1
Case Else ' for MaxPool and AvgPool
dUTFunction = 1
End Select
End Function
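The first two branches rely on the standard identities, which is why the comment notes that the input X is already $f(x)$ rather than $x$: for the logistic sigmoid, $\sigma'(x) = \sigma(x)\bigl(1-\sigma(x)\bigr)$, and for the hyperbolic tangent, $\tanh'(x) = 1-\tanh^2(x) = (1+\tanh x)(1-\tanh x)$.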
The function above differentiates each unit; taking this as the direction of gradient descent, the gradient is computed. The NNGrad() function is defined below:
Private Function NNGrad(net As NN, Gradient() As TripleArr, Jacobian() As TripleArr, Funcw() As TripleArr) As Double
' another important function that calculates the error value and then gradient of a net
' gradient of a net is the basis of almost all training algorithms
' gradient is calculated and transferred by ref directly to the input parameter Gradient()
' Jacobian is calculated and passed by-Ref directly to the input parameter Jacobian()
' the per-sample error function values are passed by-Ref directly to the input parameter Funcw()
' the total error value is returned as the function value
Dim delta() As Double, vec() As Double, net0 As NN
Dim output() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, target() As Double
Dim b As Double, num As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, sampleCount As Long
Dim layerSize As Long, layercount As Long, curSample As Long, inSize As Long
Dim preLayer As layer
Dim curLayer As layer
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
Dim unitSize As Long
layercount = getLayerCount(net)
With net
outSize = UBound(.OutLayer.units)
sampleCount = trainSampleCount + validationSampleCount
labelSize = targetSampleSize
vecSize = .inSize
For curSample = 1 To sampleCount ' calculate the grad for all samples
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' section 1: calculate the grad of outlayer
vec = InputData(curSample).d ' read the input vec from input vec set
target = targetData(curSample).d 'read the target from target set
ReDim curDelta.d(1 To labelSize)
utf = .OutLayer.utf
NNFP net, vec, output ' calculate the outputs of net
For i = 1 To outSize ' calculates the error of output by calculating value of target - output for each output unit
out = output(layercount).d(i)
derive(layercount).d(i) = dUTFunction(utf, out)
tar = target(i)
num = (tar - out)
Funcw(layercount).m(i).d(curSample) = num
curDelta.d(i) = num * derive(layercount).d(i) ' delta is calculated via dfunction of utf
Jacobian(layercount).m(i).d(curSample, 0) = derive(layercount).d(i)
For k = 1 To UBound(.OutLayer.units(1).w)
Jacobian(layercount).m(i).d(curSample, k) = derive(layercount).d(i) * output(layercount - 1).d(k)
Next k
Next i
If layercount > 1 Then ' there are hidden layers, delta should be calculated layer by layer
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' Section 2: calculate the delta value of hidden layers
preLayer = .OutLayer ''''' let prelayer = outlayer
'to calculate delta of current layer, the whole previous layer should be used to calculate
preDelta = curDelta ''''' let predelta = delta of outlayer
'''''''''''''''''''''
For i = layercount - 1 To 1 Step -1 ' calculate from the last hidden layer back to the first hidden layer
curLayer = .hLayer(i)
utf = curLayer.utf
layerSize = UBound(curLayer.units)
ReDim curDelta.d(1 To layerSize)
For j = 1 To layerSize ' calculate delta for all units in curlayer based on previous layer
cur = 0 ' prepare to calculate the sum product
For k = 1 To UBound(preDelta.d) ' delta for each units is based on the sum of product of predelta and weight of the unit
cur = cur + preDelta.d(k) * preLayer.units(k).w(j)
Next k
Funcw(i).m(j).d(curSample) = cur
out = output(i).d(j)
derive(i).d(j) = dUTFunction(utf, out)
num = cur * derive(i).d(j) ' delta value for each level is determined by the d function of utf of each layer
curDelta.d(j) = num
Jacobian(i).m(j).d(curSample, 0) = derive(i).d(j) 'bias
For k = 1 To UBound(.hLayer(i).units(1).w)
Jacobian(i).m(j).d(curSample, k) = derive(i).d(j) * output(i - 1).d(k)
Next
Next j
preLayer = curLayer
preDelta = curDelta
Next i
End If
Next curSample
num = 0
For i = 1 To layercount ' calculate gradient
If i < layercount Then
layerSize = UBound(.hLayer(i).units)
Else
layerSize = outSize
End If
For j = 1 To layerSize
Gradient(i).m(j).d = mMulti(Jacobian(i).m(j).d, Funcw(i).m(j).d, True, , , 1)
num = num + mMulti(Funcw(i).m(j).d, Funcw(i).m(j).d, True, , 1, 1)
Next
Next
End With
NNGrad = num / validationSampleCount
End Function
Propagating the Gradient Backwards
After a layer's gradient is computed, it is propagated backwards and used to compute the gradient of the layer before it. This is done by the NNBP() function:
Public Function NNBP(net As NN, vec() As Double, target() As Double, Ate As Double, Optional m As Double = 0, Optional ClsCor As Byte = 0) As Double
' This function provides the stochastic gradient descent algorithm: weights are updated based on one sample _
' of the training vector
' This function returns total error value before NNBP
' complete one step FP and BP updates of all weights, return updated NN
' vec() as input vector, target() as label of input vector, Ate as learning speed, net as net
Dim delta() As Double, previousDelta() As Double
Dim output() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double
Dim b As Double, num As Double, ErrorValue As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, layercount As Long, outClass As Long, tgtClass As Long
Dim layerSize As Long
Dim preLayer As layer
Dim curLayer As layer
Dim layerDelta() As MultiArr ' stores all delta value in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
layercount = getLayerCount(net)
ErrorValue = 0 ' calculate the total error
ReDim layerDelta(1 To layercount) ' set the layer delta structure
If m > 0 Then
deltaW = allZeroNN
End If
With net
outSize = UBound(.OutLayer.units)
labelSize = UBound(target)
vecSize = UBound(vec)
If labelSize = outSize Then ' checks if the size of label vector is the same as size of output vector
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of outlayer
ReDim curDelta.d(1 To labelSize)
utf = .OutLayer.utf
outClass = NNFP(net, vec, output) ' calculate the outputs of net
tgtClass = 1
tar = 0
For i = 1 To outSize ' calculates the error of output by calculating value of target - output for each output unit
out = output(.hlayercount + 1).d(i)
tar = target(i)
If tar > target(tgtClass) Then tgtClass = i ' track the index of the largest target value
num = (tar - out)
ErrorValue = ErrorValue + num * num ' calculate total error value
curDelta.d(i) = num * dUTFunction(utf, out) ' delta is calculated via dfunction of utf
Next
If outClass = tgtClass Then ClsCor = 1
ErrorValue = ErrorValue / 2
layerDelta(layercount) = curDelta
If layercount > 1 Then ' there are hidden layers, delta should be calculated layer by layer
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of hidden layers
preLayer = .OutLayer ''''' let prelayer = outlayer
'to calculate delta of current layer, the whole previous layer should be used to calculate
preDelta = layerDelta(layercount) ''''' let predelta = delta of outlayer
'''''''''''''''''''''
For i = layercount - 1 To 1 Step -1 ' calculate from the last hidden layer back to the first hidden layer
curLayer = .hLayer(i)
utf = curLayer.utf
layerSize = UBound(curLayer.units)
ReDim curDelta.d(1 To layerSize)
For j = 1 To layerSize ' calculate delta for all units in curlayer based on previous layer
cur = 0 ' prepare to calculate the sum product
For k = 1 To UBound(preDelta.d) ' delta for each units is based on the sum of product of predelta and weight of the unit
cur = cur + preDelta.d(k) * preLayer.units(k).w(j)
Next
out = output(i).d(j)
num = cur * dUTFunction(utf, out) ' delta value for each level is determined by the d function of utf of each layer
curDelta.d(j) = num
Next
preLayer = curLayer
preDelta = curDelta
layerDelta(i) = curDelta
Next
'''''''''''' All deltas for each unit in all layers have been calculated
'''''''''''' to update all weights
inputs = vec ' input is to be used to update the first layer
For i = 1 To layercount - 1 ' update weights for each hidden layer
layerSize = UBound(.hLayer(i).units)
For j = 1 To layerSize ' for each units in the layer, updateunits with Ate*delta* input
tempIn = inputs
b = layerDelta(i).d(j) * Ate ' update the bias value
If m > 0 Then
updateUnit deltaW.hLayer(i).units(j), tempIn, 1, b
updateUnit deltaW.hLayer(i).units(j), previousDeltaW.hLayer(i).units(j).w, previousDeltaW.hLayer(i).units(j).b, m
.hLayer(i).units(j) = updateUnit(.hLayer(i).units(j), deltaW.hLayer(i).units(j).w, deltaW.hLayer(i).units(j).b) ' apply the summed delta weights, as in the outlayer branch below
End If
If m = 0 Then .hLayer(i).units(j) = updateUnit(.hLayer(i).units(j), tempIn, 1, b) ' b as ratio of updating
' to save time, temp net is not used if momentum =0
Next
inputs = output(i).d
Next
layerSize = UBound(.OutLayer.units) ' prepare the outlayer and update weights for each out units
For j = 1 To layerSize ' for each units in the outlayer, updateunits with Ate*delta* input
tempIn = inputs
b = layerDelta(layercount).d(j) * Ate
If m > 0 Then
updateUnit deltaW.OutLayer.units(j), tempIn, 1, b
updateUnit deltaW.OutLayer.units(j), previousDeltaW.OutLayer.units(j).w, previousDeltaW.OutLayer.units(j).b, m
.OutLayer.units(j) = updateUnit(.OutLayer.units(j), deltaW.OutLayer.units(j).w, deltaW.OutLayer.units(j).b)
End If
If m = 0 Then .OutLayer.units(j) = updateUnit(.OutLayer.units(j), tempIn, 1, b)
Next
' update of all weights done, copy current delta weights to previous delta weights for next round updates
If m > 0 Then previousDeltaW = deltaW
End If
Else
MsgBox "labelsize does not equal to outsize"
End If
End With
NNBP = ErrorValue
End Function
Stochastic Gradient Descent and Standard Gradient Descent
The function above first calls NNFP() once to compute the network's output, then computes the difference between the output and the target (the "correct answer") along with the derivatives, and propagates them level by level back to the first layer. Once the computation is finished, the connection weights of all units are updated level by level, completing one weight update.
The function above performs only one forward pass before each weight update; in other words, it learns from a single sample, measures the error, and immediately updates the weights. This method is called "Stochastic Gradient Descent". In many cases a single sample covers too narrow a slice of the data, so one can instead let the network learn from, say, 100 samples, compute the average error between their actual outputs and targets, and use this average error to update the weights. This is "Standard Gradient Descent" (batch gradient descent), implemented in the NNBP2() function:
Public Function NNBP2(net As NN, vset() As MultiArr, tgtset() As MultiArr, Ate As Double, Optional m As Double = 0) As Double
' This function provides one epoch of the standard gradient descent algorithm, which calculates the delta values of the weights based on _
' the whole training vector set and updates the weights with the summed delta weights
' This function returns total error value before NNBP2 epoch
' vset() is a multi array that contains multiple input vectors in multiple rows, one vector in each item
' tgtset() is a multi array that contains targets of input vector sets, one target() in each item
' net is the network to be trained, Ate is learning speed
' M is the parameter momentum
' the essence of batch (standard) gradient descent is to repeat back propagation on the same _
' network and sum all delta weights, then update all weights in one go. Thus net0 is initialized as an "all 0" net _
' and receives all the weight updates during BP; the net itself is updated after all samples are calculated
Dim delta() As Double, previousDelta() As Double, vec() As Double
Dim output() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, target() As Double
Dim b As Double, num As Double, ErrorValue As Double, totalError As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, sampleCount As Long
Dim layerSize As Long, layercount As Long, curSample As Long
Dim preLayer As layer
Dim curLayer As layer
Dim layerDelta() As MultiArr ' stores all delta values in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
Dim net0 As NN
layercount = getLayerCount(net)
totalError = 0 ' set starting value of total error
ReDim layerDelta(1 To layercount) ' set the layer delta structure
net0 = allZeroNN ' set net0 to be an "all 0" net work to receive sum of weights
With net
outSize = UBound(.OutLayer.units)
sampleCount = UBound(vset)
labelSize = targetSampleSize
vecSize = .inSize
If labelSize = outSize And vecSize = inputSampleSize Then ' checks if the size of label vector is the same as size of output vector
For curSample = 1 To sampleCount ' calculate the delta value for all samples before updating weights
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of outlayer
ErrorValue = 0 ' set starting value of errorvalue
vec = InputData(curSample).d ' read the input vec from input vec set
target = targetData(curSample).d 'read the target from target set
ReDim curDelta.d(1 To labelSize)
utf = .OutLayer.utf
NNFP net, vec, output ' calculate the outputs of net
For i = 1 To outSize ' calculates the error of output by calculating value of target - output for each output unit
out = output(.hlayercount + 1).d(i)
tar = target(i)
num = (tar - out)
If curSample > sampleCount - validationSampleCount Then _
ErrorValue = ErrorValue + num * num ' error value is recorded only for validation samples
curDelta.d(i) = num * dUTFunction(utf, out) ' delta is calculated via dfunction of utf
Next
ErrorValue = ErrorValue / 2
layerDelta(layercount) = curDelta
If layercount > 1 Then ' there are hidden layers, delta should be calculated layer by layer
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of hidden layers
preLayer = .OutLayer ''''' let prelayer = outlayer
'to calculate delta of current layer, the whole previous layer should be used to calculate
preDelta = layerDelta(layercount) ''''' let predelta = delta of outlayer
'''''''''''''''''''''
For i = layercount - 1 To 1 Step -1 ' calculate from the last hidden layer back to the first hidden layer
'preDelta = curDelta 'to calculate delta of current layer, that of previous layer is required
curLayer = .hLayer(i)
utf = curLayer.utf
layerSize = UBound(curLayer.units)
ReDim curDelta.d(1 To layerSize)
For j = 1 To layerSize ' calculate delta for all units in curlayer based on previous layer
cur = 0 ' prepare to calculate the sum product
For k = 1 To UBound(preDelta.d) ' delta for each units is based on the sum of product of predelta and weight of the unit
cur = cur + preDelta.d(k) * preLayer.units(k).w(j)
Next k
out = output(i).d(j)
num = cur * dUTFunction(utf, out) ' delta value for each level is determined by the d function of utf of each layer
curDelta.d(j) = num
Next j
preLayer = curLayer
preDelta = curDelta
layerDelta(i) = curDelta
Next i
End If
'''''''''''' delta calculated from one sample is completed
'''''''''''' a temperary weight array will be updated to sum all weight change
inputs = vec ' input is to be used to update the first layer
For i = 1 To layercount - 1 ' update weights for each hidden layer
layerSize = UBound(.hLayer(i).units)
For j = 1 To layerSize ' for each units in the layer, updateunits with Ate*delta* input
tempIn = inputs
For k = 1 To UBound(inputs)
num = tempIn(k) * layerDelta(i).d(j) * Ate
tempIn(k) = num
Next
b = layerDelta(i).d(j) * Ate ' update the bias value
net0.hLayer(i).units(j) = updateUnit(net0.hLayer(i).units(j), tempIn, b) 'in NNBP2, net0 is updated for each sample
Next
ReDim inputs(1 To layerSize)
ReDim tempIn(1 To layerSize)
inputs = output(i).d
Next
layerSize = UBound(.OutLayer.units) ' prepare the outlayer and update weights for each out units
For j = 1 To layerSize ' for each units in the outlayer, updateunits with Ate*delta* input
tempIn = inputs
For k = 1 To UBound(inputs)
num = tempIn(k) * layerDelta(layercount).d(j) * Ate ' get the delta * input for the out layer
tempIn(k) = num
Next
b = layerDelta(layercount).d(j) * Ate
net0.OutLayer.units(j) = updateUnit(net0.OutLayer.units(j), tempIn, b) 'in NNBP2, net0 is updated for each sample
Next
totalError = totalError + ErrorValue
Next curSample
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'complete of summing weights into net0, next step: add net0 weights to net
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
For i = 1 To layercount - 1 ' add all hiddenlayers
layerSize = UBound(.hLayer(i).units)
For j = 1 To layerSize
updateUnit .hLayer(i).units(j), net0.hLayer(i).units(j).w, net0.hLayer(i).units(j).b
If m > 0 Then updateUnit .hLayer(i).units(j), previousDeltaW.hLayer(i).units(j).w, _
previousDeltaW.hLayer(i).units(j).b, m
' update one more time according to previous update of weights
Next j
Next i
layerSize = UBound(.OutLayer.units)
For j = 1 To layerSize
updateUnit .OutLayer.units(j), net0.OutLayer.units(j).w, net0.OutLayer.units(j).b
If m > 0 Then updateUnit .OutLayer.units(j), previousDeltaW.OutLayer.units(j).w, _
previousDeltaW.OutLayer.units(j).b, m
' update one more time according to previous update of weights
Next
previousDeltaW = net0 ' store the summed delta weights; they will be used in the next iteration
Else
MsgBox "labelsize does not equal to outsize"
End If
End With
If sampleCount > validationSampleCount Then sampleCount = validationSampleCount
NNBP2 = totalError / sampleCount
End Function
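A hedged sketch of how the two functions might be driven from a training loop; TrainLoop, maxEpochs and the 0.05 learning rate are illustrative choices of mine, while InputData()/targetData() and trainSampleCount are the module-level variables already used above:

Sub TrainLoop(net As NN, vset() As MultiArr, tgtset() As MultiArr)
    Dim epoch As Long, s As Long, errVal As Double
    Const maxEpochs As Long = 1000
    For epoch = 1 To maxEpochs
        ' stochastic descent: one FP/BP pass and one weight update per sample
        For s = 1 To trainSampleCount
            errVal = NNBP(net, InputData(s).d, targetData(s).d, 0.05)
        Next s
        ' batch descent alternative: accumulate deltas over the whole set, update once
        ' errVal = NNBP2(net, vset, tgtset, 0.05)
    Next epoch
End Sub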
The Levenberg-Marquardt Algorithm
Stochastic gradient descent and standard gradient descent are both variants of hill climbing. Both take "many small, quick steps": the network's weights are adjusted repeatedly, a tiny amount each time (the learning rate is typically between 0.02 and 0.1), so the network may take a very long time to converge. Back when computing power was scarce, several far more effective algorithms were developed that greatly improve learning efficiency, and Levenberg-Marquardt is one of them. When the network is not too large, the LM algorithm may reach best-accuracy convergence in as few as 2 to 5 epochs, where gradient descent may need more than 1000.
That does not make LM a silver bullet. It needs few training epochs, but each epoch is computationally expensive. For small networks it can be tens to hundreds of times more efficient, but as soon as network complexity grows, its efficiency advantage is quickly cancelled out.
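The core update that the code below applies unit by unit is the damped Gauss-Newton step

$$h_{lm} = \left(J^T J + \lambda I\right)^{-1} J^T f(w)$$

where $J$ is the Jacobian of the per-sample errors $f(w)$, and the damping factor $\lambda$ is raised or lowered according to the gain ratio (the actual error reduction divided by the reduction predicted by the quadratic model).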
The LM algorithm is implemented in the two functions NNLM() and prepLM().
prepLM() computes several key intermediate variables needed by the LM algorithm; note that these intermediate variables all use the MultiArr data structure:
Public Function prepLM(net As NN, vset() As MultiArr) As Double
' prepare triple array matrices that are used for NNLM training and initialize them before training
Dim layerSize As Long, layercount As Long, unitSize As Long, sampleCount As Long
Dim i As Long, j As Long
With net
layercount = getLayerCount(net)
sampleCount = trainSampleCount + validationSampleCount
ReDim Jacob(1 To layercount)
ReDim A(1 To layercount)
ReDim grad(1 To layercount)
ReDim hlm(1 To layercount)
ReDim fw(1 To layercount)
ReDim fwnew(1 To layercount)
ReDim id(1 To layercount)
'initialize all multiarrs
ReDim lamda(1 To layercount)
ReDim v(1 To layercount)
ReDim r(1 To layercount)
ReDim derive(1 To layercount)
For i = 1 To layercount
' redim Jacobians and JJs layer by layer
If i < layercount Then ' initialize J and JJ for hidden layers
layerSize = UBound(.hLayer(i).units) ' unitcount = number of units
Else ' initialize J and JJ for outlayer
layerSize = UBound(.OutLayer.units) ' unitcount = number of units
End If
ReDim Jacob(i).m(1 To layerSize)
ReDim A(i).m(1 To layerSize)
ReDim grad(i).m(1 To layerSize)
ReDim hlm(i).m(1 To layerSize)
ReDim fw(i).m(1 To layerSize)
ReDim fwnew(i).m(1 To layerSize)
ReDim id(i).m(1 To layerSize)
ReDim lamda(i).d(1 To layerSize)
ReDim v(i).d(1 To layerSize)
ReDim r(i).d(1 To layerSize)
ReDim derive(i).d(1 To layerSize)
For j = 1 To layerSize ' initialize Jacobian and JJ for each unit
If i < layercount Then
unitSize = .hLayer(i).unitSize1
Else
unitSize = .OutLayer.unitSize1
End If
' 2 dimensional array to store Jacobian, delta w are stored in rows, _
' b(0, n) is used for bias value also as w0 in order to facilitate array multiplication
ReDim Jacob(i).m(j).d(1 To sampleCount, 0 To unitSize)
ReDim A(i).m(j).d(0 To unitSize, 0 To unitSize) ' JJ as result of J(T) * J is smaller in size
ReDim fw(i).m(j).d(1 To sampleCount)
ReDim fwnew(i).m(j).d(1 To sampleCount)
ReDim grad(i).m(j).d(0 To unitSize) ' grad is a vector for each unit
ReDim hlm(i).m(j).d(0 To unitSize)
ReDim id(i).m(j).d(0 To unitSize, 0 To unitSize)
id(i).m(j).d = mIdentity(0, unitSize) 'initialize identity matrix
lamda(i).d(j) = 0.1 'initialize all lamda
v(i).d(j) = 2
r(i).d(j) = 0
derive(i).d(j) = 0
Next
Next
End With
End Function
Public Function NNLM(net As NN, vset() As MultiArr, tgtset() As MultiArr) As Double
' This function provides one epoch of neural network training with Levenberg Marquardt algorithm
' This function returns total error value of the NN output, the total error will be used to decide when training stops
' vset() is a multi array that contains multiple input vectors in multiple rows, one vector in each item
' tgtset() is a multi array that contains targets of input vector sets, one target() in each item
' net is the network whose weights are adjusted according to the data and its error
' lamda is the damping parameter used in the function to define the next updates of weights, and is adjusted according to _
' the output of the updated net. Lamda is defined as a module-level variable, so this function changes its value _
' directly, and the updated Lamda is used in the next iteration or epoch
' the Levenberg-Marquardt algorithm uses an approximation of the second derivatives of the mean square error when possible, _
' combined with gradient descent; it normally converges much faster and needs far fewer epochs. The _
' LM algorithm is among the most efficient network training algorithms
Dim delta() As Double, vec() As Double
Dim output() As MultiArr, output0() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, target() As Double
Dim b As Double, num As Double, ErrorValue As Double, totalError As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, sampleCount As Long
Dim layerSize As Long, layercount As Long, curSample As Long, inSize As Long
Dim preLayer As layer, aInv As Variant
Dim curLayer As layer
'Dim layerDelta() As MultiArr ' all delta values in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
Dim net0 As NN
layercount = getLayerCount(net)
totalError = 0 ' set starting value of total error
ReDim layerDelta(1 To layercount) ' set the layer delta structure
net0 = net ' next new net is updated based on current net
With net
totalError = NNGrad(net, grad, Jacob, fw)
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'complete of calculating of all Jacobian for units
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' section 3, calculate of hlm
For i = 1 To layercount
If i < layercount Then
layerSize = UBound(.hLayer(i).units)
Else
layerSize = UBound(.OutLayer.units)
End If
For j = 1 To layerSize
' to calculate JJ for layer(i) unit(j)
A(i).m(j).d = mMulti(Jacob(i).m(j).d, Jacob(i).m(j).d, True) ' get J(T) * J
A(i).m(j).d = mAdd(A(i).m(j).d, id(i).m(j).d, , lamda(i).d(j)) ' get J(T) * J + Lamda * Id
A(i).m(j).d = mInv(A(i).m(j).d) ' get (J(T) * J + Lamda * Id) ^ -1
grad(i).m(j).d = mMulti(Jacob(i).m(j).d, fw(i).m(j).d, True, , , 1) ' get J(T) * fw
hlm(i).m(j).d = mMulti(A(i).m(j).d, grad(i).m(j).d, , , , 1)
' calculate the result of (J(T) * J + Lamda * I)^-1 * J(T) * grad
' update the net for result testing
If i < layercount Then
updateUnit net0.hLayer(i).units(j), hlm(i).m(j).d, hlm(i).m(j).d(0)
Else
updateUnit net0.OutLayer.units(j), hlm(i).m(j).d, hlm(i).m(j).d(0)
End If
Next j
Next i
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' Section 4: calculate New f(w) and F(w) with updated net0
NNFw net0, fwnew
'''''''''''''''''''''''''''''''''''''''''
' Section 5: calculate the gain ratio q: q = (F(w) - F(w new)) / (0.5 * hlm(T) * (lamda * hlm - grad))
' gain ratio is defined as the ratio between actual F(w) change to estimated change
' calculate F(w) and F(w new), in which F(w) = 0.5 * f(w)T * f(w)
For i = 1 To layercount ' calculate F(w)-F(w new) layer by layer
If i < layercount Then
layerSize = UBound(.hLayer(i).units)
Else
layerSize = UBound(.OutLayer.units)
End If
For j = 1 To layerSize
' calculate estimated ratio change
grad(i).m(j).d = mAdd(hlm(i).m(j).d, grad(i).m(j).d, lamda(i).d(j), -1, 1)
num = mMulti(hlm(i).m(j).d, grad(i).m(j).d, True, , 1, 1)
num = (mMulti(fw(i).m(j).d, fw(i).m(j).d, True, , 1, 1) - _
mMulti(fwnew(i).m(j).d, fwnew(i).m(j).d, True, , 1, 1)) / -num
' gain ratio = Fw - Fwnew / 0.5 * hlm ... => 0.5* fwT*fw - 0.5*fwnewT*fwnew / 0.5 * hlm ...
' => 2 * (0.5*fwT*fw - 0.5*fwnewT*fwnew) / hlm ... => (fwT*fw - fwnewT*fwnew) / hlm ...
r(i).d(j) = num
' update lamda and net according to gain ratio
If num > 0 Then
' accept the step and update current unit of Net and Lamda
If i < layercount Then
.hLayer(i).units(j) = net0.hLayer(i).units(j)
Else
.OutLayer.units(j) = net0.OutLayer.units(j)
End If
' update current lamda
If num > 10 Then num = 10
lamda(i).d(j) = lamda(i).d(j) * max(0.33333, 1 - (2 * num - 1) ^ 3)
v(i).d(j) = 2
Else
' current unit of net will NOT be updated, step Discarded, update Lamda
If lamda(i).d(j) < 10000 Then lamda(i).d(j) = lamda(i).d(j) * v(i).d(j)
If v(i).d(j) < 1000 Then v(i).d(j) = v(i).d(j) * 2
End If
Next
Next
' all units of nn are updated, calculate latest total error
End With
NNLM = totalError / validationSampleCount
End Function
For a detailed introduction to the Levenberg-Marquardt algorithm, consult further references.
The SCG Algorithm
Scaled Conjugate Gradient (SCG) is another algorithm that improves the efficiency of neural network training. It is implemented in the two functions prepCG() and NNCG().
In some situations SCG performs close to LM, converging within 15 to 30 epochs; its advantage is that each epoch costs considerably less computation than an LM epoch.
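A minimal driving loop might look like the following sketch; TrainSCG, the 30-epoch cap, and the 0.001 stopping threshold are illustrative assumptions of mine. prepCG() must run once before the epoch loop, because it also computes the first gradient:

Sub TrainSCG(net As NN, vset() As MultiArr, tgtset() As MultiArr)
    Dim epoch As Long, mse As Double
    prepCG net, vset, tgtset ' allocate work arrays and compute the initial gradient
    For epoch = 1 To 30
        mse = NNCG(net, vset, tgtset, epoch) ' epoch decides when the conjugate direction is restarted
        If mse < 0.001 Then Exit For
    Next epoch
End Sub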
Public Function prepCG(net As NN, vset() As MultiArr, tgtset() As MultiArr) As Double
' prepare variables needed for scaled conjugate gradient training, and calculate the _
' gradient for the first iteration
Dim delta() As Double, vec() As Double, net0 As NN
Dim output() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, target() As Double
Dim b As Double, num As Double, ErrorValue As Double, totalError As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, sampleCount As Long
Dim layerSize As Long, layercount As Long, curSample As Long, inSize As Long
Dim preLayer As layer
Dim curLayer As layer
'Dim layerDelta() As MultiArr ' all delta values in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
Dim unitSize As Long
With net
layercount = getLayerCount(net)
sampleCount = trainSampleCount + validationSampleCount
ReDim vectorS(1 To layercount)
ReDim vectorP(1 To layercount)
ReDim vectorR(1 To layercount)
ReDim NewVectorR(1 To layercount)
ReDim Jacob(1 To layercount)
ReDim fw(1 To layercount)
ReDim fwnew(1 To layercount)
ReDim grad(1 To layercount)
ReDim SCGSuccess(1 To layercount)
ReDim SCGFound(1 To layercount)
'initialize all multiarrs
ReDim lamda(1 To layercount)
ReDim lamdaBar(1 To layercount)
ReDim sigma(1 To layercount)
ReDim alpha(1 To layercount)
ReDim scalarMu(1 To layercount)
ReDim scalarDelta(1 To layercount)
ReDim scalarDeltaC(1 To layercount)
ReDim derive(1 To layercount)
For i = 1 To layercount
' redim Jacobians and JJs layer by layer
If i < layercount Then ' initialize J and JJ for hidden layers
layerSize = UBound(.hLayer(i).units) ' unitcount = number of units
Else ' initialize J and JJ for outlayer
layerSize = UBound(.OutLayer.units) ' unitcount = number of units
End If
ReDim vectorS(i).m(1 To layerSize)
ReDim vectorP(i).m(1 To layerSize)
ReDim vectorR(i).m(1 To layerSize)
ReDim NewVectorR(i).m(1 To layerSize)
ReDim Jacob(i).m(1 To layerSize)
ReDim fw(i).m(1 To layerSize)
ReDim fwnew(i).m(1 To layerSize)
ReDim grad(i).m(1 To layerSize)
ReDim SCGSuccess(i).d(1 To layerSize)
ReDim SCGFound(i).d(1 To layerSize)
ReDim lamda(i).d(1 To layerSize)
ReDim lamdaBar(i).d(1 To layerSize)
ReDim sigma(i).d(1 To layerSize)
ReDim alpha(i).d(1 To layerSize)
ReDim scalarMu(i).d(1 To layerSize)
ReDim scalarDelta(i).d(1 To layerSize)
ReDim scalarDeltaC(i).d(1 To layerSize)
ReDim derive(i).d(1 To layerSize)
For j = 1 To layerSize ' initialize Jacobian and JJ for each unit
If i < layercount Then
unitSize = .hLayer(i).unitSize1
Else
unitSize = .OutLayer.unitSize1
End If
' 2 dimensional array to store Jacobian, delta w are stored in rows, _
' b(0, n) is used for bias value also as w0 in order to facilitate array multiplication
ReDim vectorS(i).m(j).d(0 To unitSize)
ReDim vectorP(i).m(j).d(0 To unitSize)
ReDim vectorR(i).m(j).d(0 To unitSize)
ReDim NewVectorR(i).m(j).d(0 To unitSize)
ReDim Jacob(i).m(j).d(1 To sampleCount, 0 To unitSize)
ReDim fw(i).m(j).d(1 To sampleCount)
ReDim fwnew(i).m(j).d(1 To sampleCount)
ReDim grad(i).m(j).d(0 To unitSize)
' initial all scalars
lamda(i).d(j) = 0.5
lamdaBar(i).d(j) = 0
sigma(i).d(j) = 0.01
SCGSuccess(i).d(j) = 1 ' success = True : 1 / False : 0, initially set success factor = true
SCGFound(i).d(j) = 1 ' found = gradient of the unit, = 0 then target found
scalarMu(i).d(j) = 0
'scalarDelta(i).d(j) = 0
'scalarDeltaC(i).d(j) = 0
Next
Next
End With
' section 2, calculate grad for the initial net to facilitate the algorithm
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
NNGrad net, vectorP, Jacob, fw
vectorR = vectorP
End Function
Public Function NNCG(net As NN, vset() As MultiArr, tgtset() As MultiArr, epochs As Long) As Double
' This function provides one epoch of neural network training with Scaled Conjugated Gradient algorithm
' This function returns total error value (Mean Square Error) before the adjustment of weights
' vset() is a multi array that contains multiple input vectors in multiple rows, one vector in each item
' tgtset() is a multi array that contains targets of input vector sets, one target() in each item
' net is the network whose weights are adjusted according to the data and its error
' epochs is the current iteration epoch for SCG algorithm to determine if the conjugate vector should be recalculated
' the scaled conjugate gradient algorithm adjusts the weights of the network using second derivatives of the cost function, _
' which allows convergence to a local minimizer much faster than standard and stochastic gradient descent
' code for scaled conjugate gradient training
Const initSigma As Double = 0.01
Dim delta() As Double, vec() As Double, pp As Double
Dim output() As MultiArr, output0() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, target() As Double, success As Byte
Dim b As Double, num As Double, ErrorValue As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, sampleCount As Long
Dim layerSize As Long, layercount As Long, curSample As Long, inSize As Long
Dim preLayer As layer
Dim curLayer As layer
'Dim layerDelta() As MultiArr ' all delta values in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, utf As Byte
Dim net0 As NN
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' section 0, the gradient for first epoch is calculated already in prepCG()
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' section 1, if success then update net0 to calculate the second order information
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
layercount = getLayerCount(net)
ErrorValue = 0 ' set starting value of total error
net0 = net
With net0
outSize = UBound(.OutLayer.units)
sampleCount = trainSampleCount + validationSampleCount
labelSize = targetSampleSize
vecSize = .inSize
success = 0 ' identify if any unit is successfuly updated
For i = 1 To layercount
If i < layercount Then
layerSize = UBound(.hLayer(i).units)
Else
layerSize = outSize
End If
For j = 1 To layerSize
' calculate sigma
If SCGFound(i).d(j) > 1E-100 Then
If SCGSuccess(i).d(j) = 1 Then ' new net is updated only if success = true
sigma(i).d(j) = initSigma / Sqr(mMulti(vectorP(i).m(j).d, vectorP(i).m(j).d, True, , 1, 1))
'update unit
success = 1 ' one of the unit is successfuly updated, second order information can be calc
If i < layercount Then
updateUnit .hLayer(i).units(j), vectorP(i).m(j).d, vectorP(i).m(j).d(0), sigma(i).d(j)
Else
updateUnit .OutLayer.units(j), vectorP(i).m(j).d, vectorP(i).m(j).d(0), sigma(i).d(j)
End If
End If
End If
Next
Next
End With
If success = 1 Then
NNGrad net0, grad, Jacob, fwnew
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' section 3, calculate second order information: sigma(k) = sigma / grad(T) * grad and update new weights
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
For i = 1 To layercount
With net
If i < layercount Then
curLayer = .hLayer(i)
Else
curLayer = .OutLayer
End If
End With
layerSize = UBound(curLayer.units)
For j = 1 To layerSize
' to calculate grad for layer(i) unit(j)
If SCGFound(i).d(j) > 1E-100 Then
vectorS(i).m(j).d = mAdd(grad(i).m(j).d, vectorR(i).m(j).d, 1 / sigma(i).d(j), -1 / sigma(i).d(j), 1)
scalarDelta(i).d(j) = mMulti(vectorP(i).m(j).d, vectorS(i).m(j).d, True, , 1, 1)
num = scalarDelta(i).d(j)
End If
Next
Next
End If
net0 = net
With net0
For i = 1 To layercount
If i < layercount Then
curLayer = .hLayer(i)
Else
curLayer = .OutLayer
End If
layerSize = UBound(curLayer.units)
For j = 1 To layerSize
If SCGFound(i).d(j) > 1E-100 Then
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'section 3.1: find optimal eta with line search method
' start line search'
pp = mMulti(vectorP(i).m(j).d, vectorP(i).m(j).d, True, , 1, 1)
vectorS(i).m(j).d = mAdd(vectorS(i).m(j).d, vectorP(i).m(j).d, _
, lamda(i).d(j) - lamdaBar(i).d(j), 1) 'calculate S
scalarDelta(i).d(j) = scalarDelta(i).d(j) + (lamda(i).d(j) - lamdaBar(i).d(j)) * pp
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'section 3.2: if scalardelta is smaller than 0 then make it positive definite
If scalarDelta(i).d(j) <= 0 Then
num = 2 * scalarDelta(i).d(j) / pp
vectorS(i).m(j).d = mAdd(vectorS(i).m(j).d, vectorP(i).m(j).d, , lamda(i).d(j) - num, 1)
lamdaBar(i).d(j) = 2 * lamda(i).d(j) - num
scalarDelta(i).d(j) = lamda(i).d(j) * pp - scalarDelta(i).d(j)
End If
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'section 3.3: calculate step size eta: alpha = pT*r/delta mind that p*r <> p*p
scalarMu(i).d(j) = mMulti(vectorP(i).m(j).d, vectorR(i).m(j).d, True, , 1, 1)
alpha(i).d(j) = scalarMu(i).d(j) / scalarDelta(i).d(j)
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'section 3.4: update weights with calculated step size to prepare for comparison
If i < layercount Then
updateUnit .hLayer(i).units(j), vectorP(i).m(j).d, vectorP(i).m(j).d(0), alpha(i).d(j)
Else
updateUnit .OutLayer.units(j), vectorP(i).m(j).d, vectorP(i).m(j).d(0), alpha(i).d(j)
End If
End If
Next j
Next i
End With
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' Section 4: Calculate E(w new) and then calculate Sk = E(w) - E(wnew)
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
NNFw net0, fwnew
With net
For i = 1 To layercount ' calculate F(w)-F(w new) layer by layer
If i < layercount Then
curLayer = .hLayer(i)
Else
curLayer = .OutLayer
End If
layerSize = UBound(curLayer.units)
For j = 1 To layerSize
If SCGFound(i).d(j) > 1E-100 Then
' calculate estimated comparison parameters
num = mMulti(fwnew(i).m(j).d, fwnew(i).m(j).d, True, , 1, 1)
num = mMulti(fw(i).m(j).d, fw(i).m(j).d, True, , 1, 1) - num
num = 2 * scalarDelta(i).d(j) * num
num = num / scalarMu(i).d(j) ^ 2
If num >= 0 Then
' accept the step and update current unit of Net and Lamda
If i < layercount Then
.hLayer(i).units(j) = net0.hLayer(i).units(j)
Else
.OutLayer.units(j) = net0.OutLayer.units(j)
End If
' update current lamda
lamdaBar(i).d(j) = 0
SCGSuccess(i).d(j) = 1 ' success = true
If num >= 0.75 Then lamda(i).d(j) = lamda(i).d(j) / 2
'the algorithm will be restarted if k mod N = zero; N is a parameter _
' passed from the calling sub process
Else
' current unit of net will NOT be updated, step Discarded, update Lamda
lamdaBar(i).d(j) = lamda(i).d(j)
SCGSuccess(i).d(j) = 0 ' success = false
End If
If num < 0.25 Then lamda(i).d(j) = lamda(i).d(j) * 4 ' increase the damping when the gain ratio is small
End If
Next
Next
End With
NNCG = NNGrad(net, NewVectorR, Jacob, fw) ' get gradient of net for next iteration
For i = 1 To layercount ' calculate F(w)-F(w new) layer by layer
If i < layercount Then
curLayer = net.hLayer(i)
Else
curLayer = net.OutLayer
End If
layerSize = UBound(curLayer.units)
For j = 1 To layerSize
vectorR(i).m(j).d = NewVectorR(i).m(j).d
SCGFound(i).d(j) = mMulti(vectorR(i).m(j).d, vectorR(i).m(j).d, True, , 1, 1)
If epochs Mod (curLayer.unitSize1 + 1) = 0 Then
' restart algorithm, reset start direction P, N is different layer by layer
vectorP(i).m(j).d = vectorR(i).m(j).d
Else ' calculate next conjugate direction P with factor beta
'calculate beta and beta = num
num = mMulti(NewVectorR(i).m(j).d, NewVectorR(i).m(j).d, True, , 1, 1)
num = num - mMulti(NewVectorR(i).m(j).d, vectorR(i).m(j).d, True, , 1, 1)
num = num / scalarMu(i).d(j)
vectorP(i).m(j).d = mAdd(vectorR(i).m(j).d, vectorP(i).m(j).d, , num, 1)
End If
Next
Next
End Function
Supporting Matrix Functions
The code above makes heavy use of matrix operations, so I defined several dedicated matrix functions: multiplication, addition, inversion, and so on. To support the convolution operation in CNNs there is also a convolution function.
Private Function mExpand(arr() As Double, extraSize As Long) As Variant
' this function expands the size of an existing array by adding 0 values around the array; extraSize defines the number of 0 rings added, and
' performs rotation and extraction of array elements as required by CNNBP
' a 3-D array (with channel definition) is always passed because this function is only called from a CNN related function
' to save calculation time, the function extracts all channels in arr(), thus output is a 3-D array
Dim rows As Long, cols As Long, newRows As Long, newCols As Long, channels As Long
Dim i As Long, j As Long, k As Long
Dim arr2() As Double
On Error GoTo errorHandler
rows = UBound(arr, 1)
cols = UBound(arr, 2)
channels = UBound(arr, 3)
newRows = rows + 2 * extraSize
newCols = cols + 2 * extraSize
' re define the new array and let value to the new array
ReDim arr2(1 To newRows, 1 To newCols, 1 To channels)
For i = 1 To rows
For j = 1 To cols
For k = 1 To channels
arr2(i + extraSize, j + extraSize, k) = arr(i, j, k)
Next
Next
Next
mExpand = arr2
errorHandler:
If Err.Number <> 0 Then MsgBox "Unexpected error in function mExpand!!" & Err.Number & " " & Err.Description
End Function
Private Function mMulti(arr1() As Double, arr2() As Double, Optional t1 As Boolean = False, Optional t2 As Boolean = False _
, Optional dim1 As Long = 2, Optional dim2 As Long = 2) As Variant
' this function returns the product of two matrix, the function returns a 2-D matrix, a vector or a number
' t1 and t2 are transposition flags for arr1 and arr2, so that no extra transpose function is needed and time saved
' dim1 and dim2 identify the dimension of each array (1 = vector, 2 = matrix)
Dim C1 As Long, R1 As Long, C2 As Long, R2 As Long 'uBounds
Dim LC1 As Long, LR1 As Long, LC2 As Long, LR2 As Long 'lBounds
Dim i As Long, j As Long, k As Long
Dim sum As Double, result() As Double
On Error GoTo errorHandler
LR1 = LBound(arr1, 1)
R1 = UBound(arr1, 1)
If dim1 = 2 Then
LC1 = LBound(arr1, 2) ' arr1 is 2-D, read its column bounds
C1 = UBound(arr1, 2)
End If
LR2 = LBound(arr2, 1)
R2 = UBound(arr2, 1)
If dim2 = 2 Then
LC2 = LBound(arr2, 2) ' arr2 is 2-D, read its column bounds
C2 = UBound(arr2, 2)
End If
If Not t1 And Not t2 Then
If C1 = R2 And LC1 = LR2 Then ' size of arrays matches the rule, calculation can be started
' in this case, first array can NOT be 1-D, second array can be 1-D or 2-D
If dim2 = 1 Then
ReDim result(LR1 To R1)
For i = LR1 To R1
sum = 0
For k = LC1 To C1
sum = sum + arr1(i, k) * arr2(k)
Next
result(i) = sum
Next
Else
ReDim result(LR1 To R1, LC2 To C2)
For i = LR1 To R1
For j = LC2 To C2
sum = 0
For k = LC1 To C1
sum = sum + arr1(i, k) * arr2(k, j)
Next
result(i, j) = sum
Next
Next
End If
Else
mMulti = False
Exit Function
End If
ElseIf t1 And Not t2 Then
If R1 = R2 And LR1 = LR2 Then ' size of arrays matches the rule, calculation can be started
' in this case, both arr1 and arr2 can be 1-D or 2-D, so theoretically four cases are to be discussed, _
' but the case dim1=1, dim2=2 would result in a transpose of a vector, which is not needed in the algorithm; thus _
' we discuss only 3 cases: 1-D * 1-D, 2-D * 1-D, 2-D * 2-D, and we can use dim1*dim2 as the identifier
Select Case dim1 * dim2
Case 1 ' dim=1 and dim2=1, output is a Double type number
sum = 0
For k = LR2 To R2
sum = sum + arr1(k) * arr2(k)
Next
mMulti = sum ' in this case and only in this case the result is a number
Exit Function
Case 2 ' dim1=2 and dim2=1, output is a vector
ReDim result(LC1 To C1)
For i = LC1 To C1
sum = 0
For k = LR2 To R2
sum = sum + arr1(k, i) * arr2(k)
Next
result(i) = sum
Next
Case 4 ' dim1=dim2=2
ReDim result(LC1 To C1, LC2 To C2)
For i = LC1 To C1
For j = LC2 To C2
sum = 0
For k = LR2 To R2
sum = sum + arr1(k, i) * arr2(k, j)
Next
result(i, j) = sum
Next
Next
End Select
Else
mMulti = False
Exit Function
End If
ElseIf t1 And t2 Then
If R1 = C2 And LR1 = LC2 Then ' size of arrays matches the rule, calculation can be started
' in this case both arrays have to be 2-D, thus only one case is to be discussed
ReDim result(LC1 To C1, LR2 To R2)
For i = LC1 To C1
For j = LR2 To R2
sum = 0
For k = LR1 To R1
sum = sum + arr1(k, i) * arr2(j, k)
Next
result(i, j) = sum
Next
Next
Else
mMulti = False
Exit Function
End If
ElseIf Not t1 And t2 Then
If C1 = C2 And LC1 = LC2 Then ' size of arrays matches the rule, calculation can be started
' in this case both arrays have to be either 1-D or 2-D, thus only two cases are discussed:
If dim1 = 1 Then
ReDim result(LR1 To R1, LR2 To R2)
For i = LR1 To R1
For j = LR2 To R2
result(i, j) = arr1(i) * arr2(j)
Next
Next
Else
ReDim result(LR1 To R1, LR2 To R2)
For i = LR1 To R1
For j = LR2 To R2
sum = 0
For k = LC1 To C1
sum = sum + arr1(i, k) * arr2(j, k)
Next
result(i, j) = sum
Next
Next
End If
Else
mMulti = False
Exit Function
End If
End If
mMulti = result
Exit Function
errorHandler:
If Err.Number <> 0 Then
mMulti = False
MsgBox Err.Number & ", " & Err.Description & " in function mMulti!"
End If
End Function
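A quick check of the transpose flag, as a sketch; it must live in the same module because mMulti() is Private:

Private Sub TestmMulti()
    ' verify that mMulti(J, J, True) returns J(T) * J, as used by the LM code above
    Dim J(1 To 2, 0 To 1) As Double, r As Variant
    J(1, 0) = 1: J(1, 1) = 2
    J(2, 0) = 3: J(2, 1) = 4
    r = mMulti(J, J, True)
    Debug.Print r(0, 0); r(0, 1) ' -> 10  14
    Debug.Print r(1, 0); r(1, 1) ' -> 14  20
End Sub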
Private Function mAdd(arr1() As Double, arr2() As Double, Optional mu1 As Double = 1, Optional mu2 As Double = 1 _
, Optional dimention As Long = 2) As Variant
' returns the sum of two matrix as of: result = mu1 * Arr1 + mu2 * Arr2
' in which mu1 and mu2 are parameters chosen by user
Dim result() As Double, i As Long, j As Long
Dim C1 As Long, R1 As Long, C2 As Long, R2 As Long 'uBounds
Dim LC1 As Long, LR1 As Long, LC2 As Long, LR2 As Long 'lBounds
On Error GoTo errorHandler
LR1 = LBound(arr1, 1)
R1 = UBound(arr1, 1)
LR2 = LBound(arr2, 1)
R2 = UBound(arr2, 1)
If dimention = 2 Then
LC1 = LBound(arr1, 2)
C1 = UBound(arr1, 2)
LC2 = LBound(arr2, 2)
C2 = UBound(arr2, 2)
End If
If C1 = C2 And R1 = R2 And LC1 = LC2 And LR1 = LR2 Then ' both arrays must have identical bounds
' input array could be 1-D or 2-D thus two different cases should be discussed:
If dimention = 1 Then 'output is a 1-D vector
ReDim result(LR1 To R1)
For i = LR1 To R1
result(i) = mu1 * arr1(i) + mu2 * arr2(i)
Next
Else
ReDim result(LR1 To R1, LC1 To C1)
For i = LR1 To R1
For j = LC1 To C1
result(i, j) = mu1 * arr1(i, j) + mu2 * arr2(i, j)
Next
Next
End If
Else
mAdd = False
Exit Function
End If
mAdd = result
Exit Function
errorHandler:
If Err.Number <> 0 Then
mAdd = False
MsgBox Err.Number & ", " & Err.Description & " in mAdd() function"
End If
End Function
Private Function mInv(arr() As Double) As Variant
' calculate the inverse of matrix Arr using Gauss elimination
Dim i As Long, j As Long, k As Long, devCol As Long
Dim ele As Double, dev As Double, id() As Double
Dim C1 As Long, R1 As Long 'uBounds
Dim LC1 As Long, LR1 As Long 'lBounds
Dim row As Long, col As Long
Dim arr2 As Variant, res() As Double
On Error GoTo errorHandler
LR1 = LBound(arr, 1)
R1 = UBound(arr, 1)
LC1 = LBound(arr, 2)
C1 = UBound(arr, 2)
''''''''''''''''''''''''''SLOWER CALCULATION WITHOUT WORKSHEET FUNCTION'''''''''''''''''''''''''
' If C1 = R1 And LC1 = LR1 Then
' ' calculation of inverse can start
' ' create identity matrix
' Id = mIdentity(LC1, C1)
' ' step 1: top down elemination:
' For i = LC1 To C1
' ele = 0 ' find first non-zero element in each row
' For j = LC1 To C1
' If arr(i, j) <> 0 Then
' ele = arr(i, j)
' devCol = j
' Exit For ' first non-zero element found
' End If
' Next
' ' all element dev by ele
' For j = LC1 To C1
' arr(i, j) = arr(i, j) / ele
' Id(i, j) = Id(i, j) / ele
' Next
' ' all next rows reduce to 0
' For k = LC1 To C1
' If k = i Then
' ' do nothing
' Else
' dev = arr(k, devCol)
' For j = LC1 To C1
' arr(k, j) = arr(k, j) - (arr(i, j) * dev)
' Id(k, j) = Id(k, j) - (Id(i, j) * dev)
' Next
' End If
' Next
' Next
' Else
' End If
' mInv = Id
' Exit Function
''''''''''''''''''''''''''''SPEED UP CALCULATION WITH WORKSHEET FUNCTION''''''''''''''''''''''''''''
arr2 = Application.WorksheetFunction.MInverse(arr)
ReDim res(LR1 To R1, LC1 To C1)
For i = LR1 To R1
For j = LC1 To C1
res(i, j) = arr2(i + 1, j + 1)
Next
Next
mInv = res
errorHandler:
If Err.Number <> 0 Then
MsgBox Err.Number & ", " & Err.Description & " - the inverse of the matrix does not exist!"
End If
End Function
Private Function mConv(arr1() As Double, arr2() As Double, Optional fullConv As Boolean = False, Optional rot1 As Boolean _
= False, Optional rot2 As Boolean = False, Optional utf As Byte = 0) As Variant
' calculate the convolutional product of two given arrays arr1 and arr2
' arr1 is a 3-D input map and arr2 a 3-D kernel, d1 = rows, d2 = cols, d3 = channels; the output is a 2-D map summed over channels
' rot1 and rot2 are flags to determine if either of the two arrays should be rotated by 180 degrees (unused in "valid" mode)
' fullConv indicates the mode of convolution; if fullConv = False only the valid part of arr1 is convoluted
' utf determines the activation function applied to each pixel of the output map
Dim inRows As Long, inCols As Long, inChannels As Long, kRows As Long, kCols As Long
Dim i As Long, j As Long, k As Long, ii As Long, jj As Long
Dim num As Double, result() As Double
inRows = UBound(arr1, 1): inCols = UBound(arr1, 2): inChannels = UBound(arr1, 3)
kRows = UBound(arr2, 1): kCols = UBound(arr2, 2)
ReDim result(1 To inRows - kRows + 1, 1 To inCols - kCols + 1)
For i = 1 To inRows - kRows + 1 ' slide the kernel over every valid position of the input map
For j = 1 To inCols - kCols + 1
num = 0
For k = 1 To inChannels ' accumulate the products over all channels
For ii = 0 To kRows - 1
For jj = 0 To kCols - 1
num = num + arr1(i + ii, j + jj, k) * arr2(ii + 1, jj + 1, k)
Next jj
Next ii
Next k
result(i, j) = UTFunction(utf, num) ' activation (e.g. ReLU or sigmoid) is applied per pixel
Next j
Next i
mConv = result
End Function
Private Function mIdentity(n1 As Long, n As Long) As Variant
' create an m-by-m identity matrix, where m = n - n1 + 1
Dim arr() As Double, i As Long, j As Long
If n1 <= n Then
ReDim arr(n1 To n, n1 To n)
For i = n1 To n
arr(i, i) = 1 'other items are all zero
Next
mIdentity = arr
Else
mIdentity = False
End If
End Function
Private Function max(A As Double, b As Double) As Double
If A > b Then
max = A
Else
max = b
End If
End Function
Private Function min(A As Double, b As Double) As Double
If A < b Then
min = A
Else
min = b
End If
End Function
Of the functions above, mInv was originally a hand-written routine for computing the inverse of a matrix, but it turned out to be slower than Excel's WorksheetFunction.MInverse(), so the worksheet function replaced the original code. The hand-written matrix multiplication, however, is still faster than Excel's worksheet function.
Forward and Backward Propagation for Convolutional Networks
For CNNs, the ANN Toolbox implements only the simplest FP (forward propagation) and BP algorithms, placed in the CNNFP() and CNNBP() functions. Given the complexity of CNNs, however, VBA's computing speed can no longer keep up: for any CNN beyond the scale of the MNIST handwritten digits, training time becomes very long.
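For a convolutional layer, the unit computation generalizes the fully connected formula: each output pixel of feature map $u$ is

$$out_u(i,j) = A\left(\sum_{k}\sum_{ii}\sum_{jj} x(i+ii,\, j+jj,\, k)\; w_u(ii,\, jj,\, k) + b_u\right)$$

where the kernel $w_u$ slides over the input and $k$ runs over the input channels, exactly as the triple loop in CNNFP() below does.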
Public Function CNNFP(net As NN, vec() As Double, output() As MultiArr) As Long
' get feed forward calculation result of a convolutional neural network
' vec() is the input vector, vec() is a 3-Dimensional array, representing 1/2-D data with one or multi channel
' output result is stored in a Multiarray
' the function returns classification of output data if output is a class type
Dim inRows As Long, inCols As Long, inChannels As Long, kRows As Long, kCols As Long
Dim i As Long, j As Long, k As Long, h As Long, m As Long, n As Long, o As Long, ii As Long, jj As Long
Dim curLayer As layer, utf As Byte, curUnit As Long
Dim num As Double, num2 As Double
inRows = UBound(vec, 1)
inCols = UBound(vec, 2)
inChannels = UBound(vec, 3)
With net
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' section 1, prepare output structure
' output is defined as a multi array that contains a 3D array for each layer
n = .hlayercount + 1
ReDim output(0 To n) 'output(0) stores input data
'ReDim output(0).d(1 To inRows, 1 To inCols, 1 To inChannels)
For i = 1 To n
If i < n Then
curLayer = .hLayer(i)
Else
curLayer = .OutLayer
End If
With curLayer
ReDim output(i).d(1 To .outSize1, 1 To .outSize2, 1 To .outSize3)
' the outputs of all CNN layers are 3-D
End With
Next
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' output structure defined, transfer inputdata to output(0)
' invec is a multi dimensional array and output(0) is a multi array, value has to be transfered iteratively
' input data of CNN is always a 3-D array
output(0).d = vec
''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
' start to calculate layer by layer the output according to type of each layer
For h = 1 To n ' n = .hlayercount + 1
If h < n Then
curLayer = .hLayer(h)
Else
curLayer = .OutLayer
End If
num = 0
With curLayer
utf = .utf
inRows = UBound(output(h - 1).d, 1)
inCols = UBound(output(h - 1).d, 2)
inChannels = UBound(output(h - 1).d, 3)
Select Case .layerType
Case NNFullConLayer
For curUnit = 1 To UBound(.units)
For k = 1 To inChannels ' for each channel
For i = 1 To inRows ' for each item in the channel (input is 2-D)
For j = 1 To inCols
num = num + output(h - 1).d(i, j, k) * .units(curUnit).w(i, j, k)
Next
Next
Next
num = num + .units(curUnit).b
num = UTFunction(utf, num)
output(h).d(curUnit, 1, 1) = num ' output of all layers are 3-D, but fullcon layer will only use _
' the first dimension
num = 0
Next curUnit
Case NNConvLayer '
For curUnit = 1 To UBound(.units)
kRows = .unitSize1
kCols = .unitSize2 ' kchannel = unitsize3 = inchannels
' IMPROVEMENT is POSSIBLE
' next section can be put in the mConv function
' NO! separating it into an mConv function is NOT possible, because the convolution of two _
arrays yields a 2-D array, and that 2-D array cannot be copied directly into one channel of a 3-D array
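' the loops below implement a "valid" convolution: the kernel never leaves the input map,
' so the output map is (inRows - kRows + 1) x (inCols - kCols + 1)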
For i = 1 To inRows - kRows + 1 ' for each item in the channel (input is 2-D)
For j = 1 To inCols - kCols + 1
For k = 1 To inChannels
For ii = 0 To kRows - 1
For jj = 0 To kCols - 1
num = num + output(h - 1).d(i + ii, j + jj, k) * .units(curUnit).w(ii + 1, jj + 1, k)
Next jj
Next ii
Next k
num = num + .units(curUnit).b
num = UTFunction(utf, num) ' apply the unit activation function (e.g. ReLU or sigmoid), as in any CNN layer
output(h).d(i, j, curUnit) = num
num = 0
Next j
Next i
Next curUnit
Case NNPoolLayer ' there's only one unit which is always empty
kRows = .unitSize1 ' pooling rate on rows and columns of input map
kCols = .unitSize2 ' kchannel = unitsize3 = inchannels
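' pooling windows do not overlap: the window size doubles as the stride,
' which is why the i / j loops below advance with Step kRows / Step kCols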
For i = 1 To inRows Step kRows ' for every kRows X kCols block in the input array
For j = 1 To inCols Step kCols
For k = 1 To inChannels
Select Case .utf
Case UTFMaxP ' this is the definition of max pooling
num = output(h - 1).d(i, j, k) ' seed with the first cell so all-negative windows pool correctly
For ii = 0 To kRows - 1
For jj = 0 To kCols - 1
num2 = output(h - 1).d(i + ii, j + jj, k)
If num < num2 Then num = num2
Next jj
Next ii
Case UTFAvgP ' this is the definition of average pooling
For ii = 0 To kRows - 1
For jj = 0 To kCols - 1
num2 = num2 + output(h - 1).d(i + ii, j + jj, k)
Next jj
Next ii
num = num2 / (kRows * kCols) ' average over the window area
End Select
output(h).d((i - 1) \ kRows + 1, (j - 1) \ kCols + 1, k) = num ' integer division maps each window to its pooled cell; no activation function is needed
num = 0
num2 = 0
Next k
Next j
Next i
End Select
End With
Next h
j = 1
' get the category of the output if outSize1 > 1 (classification problem), _
also calculate the softmax values if OutLayer.utf = UTFSoftMax
If .OutLayer.outSize1 > 1 Then
If .OutLayer.utf = UTFSoftMax Then
num = 0
For i = 1 To .OutLayer.outSize1
num = num + output(n).d(i, 1, 1)
If output(n).d(j, 1, 1) < output(n).d(i, 1, 1) Then j = i
Next
' normalise to obtain the softmax values; the num <> 0 guard avoids a division-by-zero error
If num <> 0 Then
For i = 1 To .OutLayer.outSize1
output(n).d(i, 1, 1) = output(n).d(i, 1, 1) / num
Next
End If
Else
For i = 1 To .OutLayer.outSize1
If output(n).d(j, 1, 1) < output(n).d(i, 1, 1) Then j = i
Next
End If
End If
CNNFP = j
End With
End Function
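A quick size check ties the two spatial layer types together: a convolutional layer with an r x c kernel maps an R x C input to an (R - r + 1) x (C - c + 1) output ("valid" convolution, matching the loop bounds in CNNFP()), and a p x q pooling layer then divides the two dimensions by p and q. For MNIST-scale data, a 28 x 28 x 1 input convolved with 5 x 5 kernels gives 24 x 24 feature maps, and 2 x 2 pooling reduces them to 12 x 12.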
Public Function CNNBP(net As NN, vec() As Double, target() As Double, Ate As Double, Optional m As Double = 0, Optional ClsCor As Byte = 0) As Double
' update the weights and biases of the given net according to the difference between target and output values
' ClsCor is an identifier that indicates whether the previous output classification equals the target classification: 0 means not equal, 1 means equal
' this function uses the stochastic gradient descent algorithm; the learning rate Ate and the momentum m are given as parameters
Dim output() As MultiArr, out As Double, tar As Double, cur As Double
Dim inputs() As Double, tempIn() As Double, dUnit As NNUnit
Dim b As Double, num As Double, ErrorValue As Double
Dim vecSize As Long, labelSize As Long, outSize As Long, layercount As Long
Dim vecSize2 As Long, vecSize3 As Long, outSize2 As Long, outSize3 As Long
Dim layerSize As Long, layerSize2 As Long, layerSize3 As Long
Dim kRow As Long, kCol As Long, curRow As Long, curCol As Long, dRow As Long, dCol As Long, theRow As Long, theCol As Long
Dim preLayer As layer
Dim curLayer As layer
Dim layerDelta() As MultiArr, expndDelta() As Double ' stores all delta value in a group of deltas
Dim curDelta As MultiArr
Dim preDelta As MultiArr ' data type to store all deltas before weights in a net is updated
Dim i As Long, j As Long, k As Long, l As Long, ii As Long, jj As Long, kk As Long, utf As Byte
Dim outClass As Long, tgtClass As Long
layercount = getLayerCount(net)
ErrorValue = 0 ' calculate the total error
ReDim layerDelta(1 To layercount) ' set the layer delta structure
If m > 0 Then deltaW = allZeroNN ' deltaW is previous delta w of net
With net.OutLayer
outSize = .outSize1
outSize2 = .outSize2
outSize3 = .outSize3
vecSize = UBound(vec, 1)
vecSize2 = UBound(vec, 2)
vecSize3 = UBound(vec, 3)
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of outlayer
ReDim curDelta.d(1 To outSize, 1 To outSize2, 1 To outSize3)
utf = .utf
outClass = CNNFP(net, vec, output) ' calculate the outputs of net
tgtClass = 1
tar = 0
For i = 1 To outSize ' calculates the error of output by calculating value of target - output for each output unit
For j = 1 To outSize2
For k = 1 To outSize3
out = output(layercount).d(i, j, k)
If target(i) > target(tgtClass) Then tgtClass = i ' track the index of the largest target value
tar = target(i)
num = (tar - out)
ErrorValue = ErrorValue + num * num ' calculate total error value
curDelta.d(i, j, k) = num * dUTFunction(utf, out) ' delta is calculated via dfunction of utf
Next
Next
Next
If outClass = tgtClass Then ClsCor = 1
ErrorValue = ErrorValue / 2
layerDelta(layercount) = curDelta
End With
If layercount > 1 Then ' there are hidden layers, delta should be calculated layer by layer and according to layer type
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'' start to calculate the delta value of hidden layers depending on their layer types
preLayer = net.OutLayer ''''' let prelayer = outlayer
'to calculate delta of current layer, the whole previous layer should be used to calculate
preDelta = layerDelta(layercount) ''''' let predelta = delta of outlayer
'''''''''''''''''''''
With preLayer
For i = layercount - 1 To 1 Step -1 ' calculate from the last hidden layer back to the first hidden layer
curLayer = net.hLayer(i)
utf = curLayer.utf
outSize = curLayer.outSize1
outSize2 = curLayer.outSize2
outSize3 = curLayer.outSize3
ReDim curDelta.d(1 To outSize, 1 To outSize2, 1 To outSize3)
Select Case .layerType ' delta calculation is different for different types
Case NNFullConLayer ' delta from full connection layer
For ii = 1 To outSize ' calculate delta for all units in curlayer based on previous layer
For jj = 1 To outSize2
For kk = 1 To outSize3
cur = 0 ' prepare to calculate the sum product
For k = 1 To UBound(preDelta.d) ' the delta of each unit is the sum of products of preDelta and the unit's weights
cur = cur + preDelta.d(k, 1, 1) * preLayer.units(k).w(ii, jj, kk)
Next
out = output(i).d(ii, jj, kk)
num = cur * dUTFunction(utf, out) ' delta value for each level is determined by the d function of utf of each layer
curDelta.d(ii, jj, kk) = num
Next
Next
Next
Case NNConvLayer ' delta from convolutional layer is convolution of current weights on previous delta
kRow = .unitSize1
kCol = .unitSize2
dRow = .outSize1
dCol = .outSize2
expndDelta = mExpand(preDelta.d, kRow - 1) ' expand (zero-pad) the delta array; all channels are expanded at the same time
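' padding the delta map with (kRow - 1) zeros on each side turns the "valid" convolution
' below into a "full" convolution; together with the reversed k / l loops this convolves
' the 180-degree-rotated kernels over the deltas, which is the standard BP rule for a conv layer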
For kk = 1 To outSize3
For ii = 1 To outSize
For jj = 1 To outSize2
cur = 0
For j = 1 To UBound(.units) ' per prev channel, add the convolution of prev w and prev delta
' sumproduct calculation
For k = kRow To 1 Step -1
For l = kCol To 1 Step -1
cur = cur + .units(j).w(k, l, kk) * expndDelta(ii + kRow - k, jj + kCol - l, j)
Next
Next
Next
out = output(i).d(ii, jj, kk)
num = cur * dUTFunction(utf, out) ' delta value for each level is determined by the d function of utf of each layer
curDelta.d(ii, jj, kk) = num
Next
Next
Next
Case NNPoolLayer ' delta from pooling layer is copied directly
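' note: the delta is copied into every cell of the pooling window; this is exact for
' average pooling (up to the 1 / (kRow * kCol) factor) but a simplification for max
' pooling, where strictly only the arg-max cell should receive the delta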
kRow = .unitSize1
kCol = .unitSize2
For kk = 1 To outSize3
For ii = 1 To outSize / kRow
curRow = ii * kRow
For jj = 1 To outSize2 / kCol
num = preDelta.d(ii, jj, kk)
curCol = jj * kCol
For k = 0 To kRow - 1
For l = 0 To kCol - 1
curDelta.d(curRow - k, curCol - l, kk) = num
Next
Next
Next
Next
Next
End Select
preLayer = curLayer
preDelta = curDelta
layerDelta(i) = curDelta
Next
End With
'''''''''''' All deltas for each unit in all layers have been calculated
'''''''''''' to update all weights
inputs = vec
For i = 1 To layercount - 1 ' update weights for each hidden layer
With net.hLayer(i)
layerSize = UBound(.units)
''''calculate the input size in 3 D for conv layer weight updating
vecSize = .unitSize1 + .outSize1 - 1
vecSize2 = .unitSize2 + .outSize2 - 1
vecSize3 = .unitSize3 + .outSize3 - 1
For j = 1 To layerSize ' for each unit in the layer, update the unit according to layer type
tempIn = inputs
Select Case .layerType
Case NNFullConLayer ' unit = delta * input while input size = unit size
b = layerDelta(i).d(j, 1, 1) * Ate ' update the bias value
.units(j) = updateUnit(.units(j), tempIn, 1, b, 3) ' in this case b = delta*ate = ratio of weight update
Case NNConvLayer
'b = sum of delta
dUnit.b = .units(j).b 'temp unit used to update .units(j)
dUnit.w = .units(j).w
b = 0
For curRow = 1 To .outSize1
For curCol = 1 To .outSize2
b = b + layerDelta(i).d(curRow, curCol, j)
Next
Next
dUnit.b = b
' w gradient = convolution of the rotated delta map over the rotated input
For kk = 1 To .unitSize3 ' for each channel of unit(j) in layer(i)
For ii = 1 To .unitSize1 ' for each row in channel(kk)
For jj = 1 To .unitSize2 ' for each col in row(ii)
' the value is sumproduct of rot delta and rot input
cur = 0 ' get ready to sum all weights
For curRow = .outSize1 To 1 Step -1 ' for each row in the conv area = outsize = delta size
For curCol = .outSize2 To 1 Step -1
theRow = vecSize - .outSize1 + curRow - ii + 1
theCol = vecSize2 - .outSize2 + curCol - jj + 1
cur = cur + layerDelta(i).d(curRow, curCol, j) * tempIn(theRow, theCol, kk)
Next
Next
dUnit.w(ii, jj, kk) = cur
Next
Next
Next
.units(j) = updateUnit(.units(j), dUnit.w, dUnit.b, Ate, 3)
Case NNPoolLayer
' no weights to be updated in pooling layers
End Select
Next
inputs = output(i).d
End With
Next
End If
With net.OutLayer
layerSize = UBound(.units) ' prepare the outlayer and update weights for each out units
For j = 1 To layerSize ' for each unit in the outlayer, update the unit with Ate * delta * input
tempIn = inputs
b = layerDelta(layercount).d(j, 1, 1) * Ate
.units(j) = updateUnit(.units(j), tempIn, 1, b, 3)
Next
' update of all weights done, copy current delta weights to previous delta weights for next round updates
If m > 0 Then previousDeltaW = deltaW
End With
CNNBP = ErrorValue
End Function
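Finally, a hedged sketch of how CNNBP() can drive one training pass over a data set. As before, buildDigitCNN() and loadSample() are hypothetical helpers standing in for net construction and data access; they are not part of the toolbox code above:
Sub DemoCNNTraining()
' minimal sketch of one training epoch with CNNBP, tracking error and accuracy
Dim net As NN
Dim sample() As Double, target() As Double
Dim epochError As Double, hits As Long
Dim cor As Byte, s As Long
Const nSamples As Long = 1000
net = buildDigitCNN() ' hypothetical helper: builds and initializes the net
For s = 1 To nSamples
    Call loadSample(s, sample, target) ' hypothetical helper: fills a 3-D input and a 1-D one-hot target
    cor = 0
    ' learning rate 0.05, no momentum; CNNBP returns the sample's squared error
    epochError = epochError + CNNBP(net, sample, target, 0.05, 0, cor)
    If cor = 1 Then hits = hits + 1 ' ClsCor reports whether the pre-update output was already correct
Next s
Debug.Print "mean error: " & epochError / nSamples & ", accuracy: " & hits / nSamples
End Sub
Because ClsCor is passed by reference, the caller can accumulate a running classification accuracy without a second forward pass.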