/*
 * LeNet-5 on MNIST in a single C file, optionally built as CUDA.
 * Background reading (Chinese blog posts):
 *   LeNet-5:  https://blog.csdn.net/The_lastest/article/details/109611211
 *             https://blog.csdn.net/baidu_40840693/article/details/82958911
 *   softmax:  https://blog.csdn.net/qq_36767053/article/details/108070348
 *   backpropagation (bp)
 */
/* Fetch the MNIST data files (run in a shell):
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
*/
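/*
 * Build notes: the same translation unit compiles as plain C (e.g. gcc with
 * -fopenmp) or as CUDA (e.g. nvcc -x cu). Defining AA (presumably for an
 * editor or analyzer without the CUDA toolchain) pulls in cuda_runtime_api.h
 * and forces the __CUDACC__ path by hand; under nvcc, __CUDACC__ is defined
 * automatically. The host/device/global macros below expand to the CUDA
 * function qualifiers when compiling device code and to nothing otherwise.
 */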
#ifdef AA
#include "include/cuda_runtime_api.h"
#define __CUDACC__
#endif
#ifdef __CUDACC__
#define host __device__ __host__
#define device __device__
#define global __global__
#else
#define host
#define device
#define global
#endif
#define LENGTH_KERNEL 5
#define LENGTH_FEATURE0 32
#define LENGTH_FEATURE1 (LENGTH_FEATURE0 - LENGTH_KERNEL + 1)
#define LENGTH_FEATURE2 (LENGTH_FEATURE1 >> 1)
#define LENGTH_FEATURE3 (LENGTH_FEATURE2 - LENGTH_KERNEL + 1)
#define LENGTH_FEATURE4 (LENGTH_FEATURE3 >> 1)
#define LENGTH_FEATURE5 (LENGTH_FEATURE4 - LENGTH_KERNEL + 1)
#define INPUT 1
#define LAYER1 6
#define LAYER2 6
#define LAYER3 16
#define LAYER4 16
#define LAYER5 120
#define OUTPUT 10
#define ALPHA 0.5
#define PADDING 2
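/*
 * Shape bookkeeping: a 28x28 MNIST image is zero-padded by PADDING=2 to 32x32
 * (LENGTH_FEATURE0). Each 5x5 valid convolution shrinks a side by 4, each 2x2
 * max-pool halves it:
 *   32 -conv5-> 28 -pool-> 14 -conv5-> 10 -pool-> 5 -conv5-> 1
 * so layer5 is 120 maps of 1x1, i.e. a 120-vector feeding the 10-way output.
 */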
typedef unsigned char uint8;
typedef uint8 image[28][28];
typedef struct LeNet5
{
double weight0_1[INPUT][LAYER1][LENGTH_KERNEL][LENGTH_KERNEL];
double weight2_3[LAYER2][LAYER3][LENGTH_KERNEL][LENGTH_KERNEL];
double weight4_5[LAYER4][LAYER5][LENGTH_KERNEL][LENGTH_KERNEL];
double weight5_6[LAYER5 * LENGTH_FEATURE5 * LENGTH_FEATURE5][OUTPUT];
double bias0_1[LAYER1];
double bias2_3[LAYER3];
double bias4_5[LAYER5];
double bias5_6[OUTPUT];
}LeNet5;
typedef struct Feature
{
double input[INPUT][LENGTH_FEATURE0][LENGTH_FEATURE0];
double layer1[LAYER1][LENGTH_FEATURE1][LENGTH_FEATURE1];
double layer2[LAYER2][LENGTH_FEATURE2][LENGTH_FEATURE2];
double layer3[LAYER3][LENGTH_FEATURE3][LENGTH_FEATURE3];
double layer4[LAYER4][LENGTH_FEATURE4][LENGTH_FEATURE4];
double layer5[LAYER5][LENGTH_FEATURE5][LENGTH_FEATURE5];
double output[OUTPUT];
}Feature;
#define GETLENGTH(array) (sizeof(array)/sizeof(*(array)))
#define GETCOUNT(array) (sizeof(array)/sizeof(double))
#define FOREACH(i,count) for (unsigned i = 0; i < count; ++i)
#define CONVOLUTE_VALID(input,output,weight) \
{ \
FOREACH(o0,GETLENGTH(output)) \
FOREACH(o1,GETLENGTH(*(output))) \
FOREACH(w0,GETLENGTH(weight)) \
FOREACH(w1,GETLENGTH(*(weight))) \
(output)[o0][o1] += (input)[o0 + w0][o1 + w1] * (weight)[w0][w1]; \
}
#define CONVOLUTE_FULL(input,output,weight) \
{ \
FOREACH(i0,GETLENGTH(input)) \
FOREACH(i1,GETLENGTH(*(input))) \
FOREACH(w0,GETLENGTH(weight)) \
FOREACH(w1,GETLENGTH(*(weight))) \
(output)[i0 + w0][i1 + w1] += (input)[i0][i1] * (weight)[w0][w1]; \
}
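/*
 * CONVOLUTE_VALID computes a "valid" cross-correlation: for an NxN input and
 * a KxK kernel the result is (N-K+1)x(N-K+1), accumulated into `output`.
 * CONVOLUTE_FULL is its transpose: each input cell is scattered through the
 * kernel into an (N+K-1)x(N+K-1) output; the backward pass uses it to route
 * output error back onto the input error. No kernel flip is needed because
 * the forward pass is cross-correlation, not true convolution.
 */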
#define CONVOLUTION_FORWARD(input,output,weight,bias,action) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_VALID(input[x], output[y], weight[x][y]); \
FOREACH(j, GETLENGTH(output)) \
FOREACH(i, GETCOUNT(output[j])) \
((double *)output[j])[i] = action(((double *)output[j])[i] + bias[j]); \
}
#define CONVOLUTION_BACKWARD(input,inerror,outerror,weight,wd,bd,actiongrad)\
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_FULL(outerror[y], inerror[x], weight[x][y]); \
FOREACH(i, GETCOUNT(inerror)) \
((double *)inerror)[i] *= actiongrad(((double *)input)[i]); \
FOREACH(j, GETLENGTH(outerror)) \
FOREACH(i, GETCOUNT(outerror[j])) \
bd[j] += ((double *)outerror[j])[i]; \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_VALID(input[x], wd[x][y], outerror[y]); \
}
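/*
 * CONVOLUTION_BACKWARD, step by step:
 *   1. inerror  = sum over output maps of the full correlation of outerror
 *                 with the connecting kernel (transpose of the forward pass);
 *   2. inerror *= actiongrad(activation): the chain rule through the
 *                 activation of the layer below;
 *   3. bd[j]    = sum of outerror[j], since the bias feeds every output cell;
 *   4. wd[x][y] = valid correlation of input map x with outerror map y,
 *                 i.e. the loss gradient for that kernel.
 */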
#define SUBSAMP_MAX_FORWARD(input,output) \
{ \
const int len0 = GETLENGTH(*(input)) / GETLENGTH(*(output)); \
const int len1 = GETLENGTH(**(input)) / GETLENGTH(**(output)); \
FOREACH(i, GETLENGTH(output)) \
FOREACH(o0, GETLENGTH(*(output))) \
FOREACH(o1, GETLENGTH(**(output))) \
{ \
int x0 = 0, x1 = 0, ismax; \
FOREACH(l0, len0) \
FOREACH(l1, len1) \
{ \
ismax = input[i][o0*len0 + l0][o1*len1 + l1] > input[i][o0*len0 + x0][o1*len1 + x1];\
x0 += ismax * (l0 - x0); \
x1 += ismax * (l1 - x1); \
} \
output[i][o0][o1] = input[i][o0*len0 + x0][o1*len1 + x1]; \
} \
}
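/*
 * Branchless argmax inside each len0 x len1 pooling window: when the current
 * cell beats the running maximum, ismax is 1 and (x0, x1) jumps to (l0, l1);
 * otherwise it stays put. The backward version repeats the same scan and
 * routes the error only to the winning cell (every other inerror cell was
 * already zeroed by the caller's memset).
 */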
#define SUBSAMP_MAX_BACKWARD(input,inerror,outerror) \
{ \
const int len0 = GETLENGTH(*(inerror)) / GETLENGTH(*(outerror)); \
const int len1 = GETLENGTH(**(inerror)) / GETLENGTH(**(outerror)); \
FOREACH(i, GETLENGTH(outerror)) \
FOREACH(o0, GETLENGTH(*(outerror))) \
FOREACH(o1, GETLENGTH(**(outerror))) \
{ \
int x0 = 0, x1 = 0, ismax; \
FOREACH(l0, len0) \
FOREACH(l1, len1) \
{ \
ismax = input[i][o0*len0 + l0][o1*len1 + l1] > input[i][o0*len0 + x0][o1*len1 + x1];\
x0 += ismax * (l0 - x0); \
x1 += ismax * (l1 - x1); \
} \
inerror[i][o0*len0 + x0][o1*len1 + x1] = outerror[i][o0][o1]; \
} \
}
#define DOT_PRODUCT_FORWARD(input,output,weight,bias,action) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
((double *)output)[y] += ((double *)input)[x] * weight[x][y]; \
FOREACH(j, GETLENGTH(bias)) \
((double *)output)[j] = action(((double *)output)[j] + bias[j]); \
}
#define DOT_PRODUCT_BACKWARD(input,inerror,outerror,weight,wd,bd,actiongrad) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
((double *)inerror)[x] += ((double *)outerror)[y] * weight[x][y]; \
FOREACH(i, GETCOUNT(inerror)) \
((double *)inerror)[i] *= actiongrad(((double *)input)[i]); \
FOREACH(j, GETLENGTH(outerror)) \
bd[j] = ((double *)outerror)[j]; \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
wd[x][y] = ((double *)input)[x] * ((double *)outerror)[y]; \
}
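/*
 * The dot-product macros implement the fully connected output layer. Both
 * flatten their multi-dimensional arguments through the (double *) casts:
 * layer5 (120 maps of 1x1) becomes a 120-vector multiplied by the 120x10
 * weight matrix. The backward form emits inerror = W * outerror, the bias
 * gradient bd = outerror, and the rank-1 weight gradient
 * wd[x][y] = input[x] * outerror[y].
 */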
host static double relu(double x)
{
return x*(x > 0);
}
host static double relugrad(double y)
{
return y > 0;
}
host static double dot(double y)
{
return y;
}
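/*
 * relugrad is evaluated on the stored *activations*, not the pre-activation
 * sums; that is valid for ReLU because relu(x) > 0 exactly when x > 0 (and
 * max-pooling preserves the sign). `dot` is the identity, used so the output
 * layer stays linear ahead of the softmax in load_target().
 */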
host static inline void forward(LeNet5 *lenet, Feature *features, double(*action)(double))
{
CONVOLUTION_FORWARD(features->input, features->layer1, lenet->weight0_1, lenet->bias0_1, action);
SUBSAMP_MAX_FORWARD(features->layer1, features->layer2);
CONVOLUTION_FORWARD(features->layer2, features->layer3, lenet->weight2_3, lenet->bias2_3, action);
SUBSAMP_MAX_FORWARD(features->layer3, features->layer4);
CONVOLUTION_FORWARD(features->layer4, features->layer5, lenet->weight4_5, lenet->bias4_5, action);
DOT_PRODUCT_FORWARD(features->layer5, features->output, lenet->weight5_6, lenet->bias5_6, dot);
}
host static inline void backward(LeNet5 *lenet, LeNet5 *deltas, Feature *errors, Feature *features, double(*actiongrad)(double))
{
DOT_PRODUCT_BACKWARD(features->layer5, errors->layer5, errors->output, lenet->weight5_6, deltas->weight5_6, deltas->bias5_6, actiongrad);
CONVOLUTION_BACKWARD(features->layer4, errors->layer4, errors->layer5, lenet->weight4_5, deltas->weight4_5, deltas->bias4_5, actiongrad);
SUBSAMP_MAX_BACKWARD(features->layer3, errors->layer3, errors->layer4);
CONVOLUTION_BACKWARD(features->layer2, errors->layer2, errors->layer3, lenet->weight2_3, deltas->weight2_3, deltas->bias2_3, actiongrad);
SUBSAMP_MAX_BACKWARD(features->layer1, errors->layer1, errors->layer2);
CONVOLUTION_BACKWARD(features->input, errors->input, errors->layer1, lenet->weight0_1, deltas->weight0_1, deltas->bias0_1, actiongrad);
}
#include <math.h>
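/*
 * load_input standardizes the raw image to zero mean and unit variance and
 * writes it into the centre of the 32x32 input plane; the PADDING-wide
 * border keeps its memset zero, which equals the mean after standardization.
 */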
host static inline void load_input(Feature *features, image input)
{
double (*layer0)[LENGTH_FEATURE0][LENGTH_FEATURE0] = features->input;
const long sz = sizeof(image) / sizeof(**input);
double mean = 0, std = 0;
FOREACH(j, sizeof(image) / sizeof(*input))
FOREACH(k, sizeof(*input) / sizeof(**input))
{
mean += input[j][k];
std += input[j][k] * input[j][k];
}
mean /= sz;
std = sqrt(std / sz - mean*mean);
FOREACH(j, sizeof(image) / sizeof(*input))
FOREACH(k, sizeof(*input) / sizeof(**input))
{
layer0[0][j + PADDING][k + PADDING] = (input[j][k] - mean) / std;
}
}
host static inline void softmax(double input[OUTPUT], double loss[OUTPUT], unsigned label, unsigned count)
{
double inner = 0;
for (unsigned i = 0; i < count; ++i)
{
double res = 0;
for (unsigned j = 0; j < count; ++j)
{
res += exp(input[j] - input[i]);
}
loss[i] = 1. / res;
inner -= loss[i] * loss[i];
}
inner += loss[label];
for (unsigned i = 0; i < count; ++i)
{
loss[i] *= (i == label) - loss[i] - inner;
}
}
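/*
 * Derivation of the error written by softmax(): the first loop computes
 *   p_i = exp(x_i) / sum_j exp(x_j)
 * as 1 / sum_j exp(x_j - x_i), which avoids overflow at the price of an
 * O(n^2) pass. The loss is the squared error L = 1/2 * sum_j (p_j - t_j)^2
 * against the one-hot target t. Using dp_j/dx_i = p_j * (delta_ij - p_i),
 *   -dL/dx_i = p_i * ((t_i - p_i) - (p_label - sum_j p_j^2))
 * which is exactly loss[i] * ((i == label) - loss[i] - inner). The array
 * therefore leaves this function holding the *negative* gradient, so every
 * later update simply adds the ALPHA-scaled deltas.
 */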
host static inline void load_target(Feature *features, Feature *errors, int label)
{
double *output = (double *)features->output;
double *error = (double *)errors->output;
softmax(output, error, label, GETCOUNT(features->output));
}
static inline uint8 get_result(Feature *features)
{
double *output = (double *)features->output;
const int count = GETCOUNT(features->output);
uint8 result = 0;
double maxvalue = *output;
for (uint8 i = 1; i < count; ++i)
{
if (output[i] > maxvalue)
{
maxvalue = output[i];
result = i;
}
}
return result;
}
#include "memory.h"
global void TrainBatch(LeNet5 *lenet, image *inputs, uint8 *labels, unsigned i, void *buf)
{
#ifdef __CUDACC__
i = threadIdx.x + blockIdx.x * blockDim.x;
#endif
Feature features;
Feature errors;
LeNet5 *deltas = (LeNet5 *)buf + i;
memset(&features, 0, sizeof features);
memset(&errors, 0, sizeof errors);
memset(deltas, 0, sizeof(LeNet5));
load_input(&features, inputs[i]);
forward(lenet, &features, relu);
load_target(&features, &errors, labels[i]);
backward(lenet, deltas, &errors, &features, relugrad);
}
static inline void Train(LeNet5 *lenet, image input, uint8 label)
{
Feature features;
Feature errors;
LeNet5 deltas;
memset(&features, 0, sizeof features);
memset(&errors, 0, sizeof errors);
memset(&deltas, 0, sizeof deltas);
load_input(&features, input);
forward(lenet, &features, relu);
load_target(&features, &errors, label);
backward(lenet, &deltas, &errors, &features, relugrad);
FOREACH(i, GETCOUNT(LeNet5))
((double *)lenet)[i] += ALPHA * ((double *)&deltas)[i];
}
static inline uint8 Predict(LeNet5 *lenet, image input)
{
Feature features;
memset(&features, 0, sizeof features);
load_input(&features, input);
forward(lenet, &features, relu);
return get_result(&features);
}
#include <time.h>
#include <stdlib.h>
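/*
 * f64rand returns a uniform double in [-1, 1) by bit-twiddling: randbit is
 * the number of random bits per rand() call (assuming RAND_MAX = 2^k - 1),
 * the loop packs rand() outputs into the 52 mantissa bits, and the constant
 * 0x4000000000000000 sets the exponent of 2.0, so the assembled IEEE-754
 * value lies in [2, 4); subtracting 3 maps it to [-1, 1).
 */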
static int randbit = 0;
static inline double f64rand()
{
if (!randbit)
{
srand((unsigned)time(0));
for (unsigned i = RAND_MAX; i; i >>= 1, ++randbit);
}
	unsigned long long lvalue = 0x4000000000000000ULL;	/* exponent bits of 2.0 */
	int i = 52 - randbit;
	for (; i > 0; i -= randbit)
		lvalue |= (unsigned long long)rand() << i;
	lvalue |= (unsigned long long)rand() >> -i;
	double d;
	memcpy(&d, &lvalue, sizeof d);	/* type-pun via memcpy, not a pointer cast (strict aliasing) */
	return d - 3;	/* [2, 4) - 3 = [-1, 1) */
}
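/*
 * Initial applies Glorot/Xavier uniform initialization: uniform(-1, 1) noise
 * rescaled per layer to the limit sqrt(6 / (fan_in + fan_out)), where a conv
 * layer's fan counts are kernel^2 * maps (hence LENGTH_KERNEL * LENGTH_KERNEL
 * * (INPUT + LAYER1), and so on). Biases start at zero.
 */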
static void Initial(LeNet5 *lenet)
{
for (double *pos = (double *)lenet->weight0_1; pos < (double *)lenet->bias0_1; *pos++ = f64rand());
for (double *pos = (double *)lenet->weight0_1; pos < (double *)lenet->weight2_3; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (INPUT + LAYER1))));
for (double *pos = (double *)lenet->weight2_3; pos < (double *)lenet->weight4_5; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (LAYER2 + LAYER3))));
for (double *pos = (double *)lenet->weight4_5; pos < (double *)lenet->weight5_6; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (LAYER4 + LAYER5))));
for (double *pos = (double *)lenet->weight5_6; pos < (double *)lenet->bias0_1; *pos++ *= sqrt(6.0 / (LAYER5 + OUTPUT)));
	for (double *pos = (double *)lenet->bias0_1; pos < (double *)(lenet + 1); *pos++ = 0);	/* zero all biases */
}
#define FILE_TRAIN_IMAGE "train-images-idx3-ubyte"
#define FILE_TRAIN_LABEL "train-labels-idx1-ubyte"
#define FILE_TEST_IMAGE "t10k-images-idx3-ubyte"
#define FILE_TEST_LABEL "t10k-labels-idx1-ubyte"
#define LENET_FILE "model.dat"
#define COUNT_TRAIN 60000
#define COUNT_TEST 10000
#include <stdio.h>
int read_data(unsigned char(*data)[28][28], unsigned char label[], const int count, const char data_file[], const char label_file[])
{
	FILE *fp_image = fopen(data_file, "rb");
	FILE *fp_label = fopen(label_file, "rb");
	if (!fp_image || !fp_label)
	{
		if (fp_image) fclose(fp_image);
		if (fp_label) fclose(fp_label);
		return -1;
	}
	/* Skip the idx file headers: 16 bytes for images, 8 for labels. */
	fseek(fp_image, 16, SEEK_SET);
	fseek(fp_label, 8, SEEK_SET);
	int ok = fread(data, sizeof(image)*count, 1, fp_image) == 1
	      && fread(label, count, 1, fp_label) == 1;
	fclose(fp_image);
	fclose(fp_label);
	return ok ? 0 : -1;
}
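/*
 * training() runs one pass over the data in mini-batches: per-sample deltas
 * land in buffer, are summed into slot 0, scaled by ALPHA / batch_size, and
 * added to the weights (the deltas already hold the negative gradient). The
 * CUDA path packs everything into one device allocation -- batch_size delta
 * slots, one LeNet5 copy, then all images and labels -- and launches one
 * thread per sample, so batch_size must be a multiple of the block size (32).
 */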
int training(LeNet5 *lenet, image *train_data, uint8 *train_label, unsigned batch_size, unsigned total_size)
{
fprintf(stdout, "batchsize:%d\ttrain:%2d%%", batch_size, 0);
double *buffer = (double *) malloc(sizeof(LeNet5)*batch_size);
#ifdef __CUDACC__
cudaError_t r = cudaSuccess;
void *buf;
	if ((r = cudaMalloc(&buf, sizeof(LeNet5)*(batch_size+1) + (sizeof(image)+sizeof(uint8))*total_size)) != cudaSuccess)
		return r;
double *bufferG = (double *)buf;
LeNet5 *lenetG = (LeNet5 *)buf + batch_size;
image *train_dataG = (image *)(lenetG + 1);
uint8 *train_labelG = (uint8 *)(train_dataG + total_size);
	if ((r = cudaMemcpy(train_dataG, train_data, sizeof(image)*total_size, cudaMemcpyHostToDevice)) != cudaSuccess)
		return r;
	if ((r = cudaMemcpy(train_labelG, train_label, sizeof(uint8)*total_size, cudaMemcpyHostToDevice)) != cudaSuccess)
		return r;
#endif
for (unsigned i = 0, percent = 0; i <= total_size - batch_size; i += batch_size)
{
#ifdef __CUDACC__
		if ((r = cudaMemcpy(lenetG, lenet, sizeof(LeNet5), cudaMemcpyHostToDevice)) != cudaSuccess)
			return r;
const int block = 32;
TrainBatch<<<batch_size/block, block>>>(lenetG, train_dataG + i, train_labelG + i, batch_size, bufferG);
cudaError_t e = cudaGetLastError();
if (e) {
fprintf(stdout, "\nGPU: %d (%s)\n", e, cudaGetErrorString(e));
return e;
}
		if ((r = cudaDeviceSynchronize()) != cudaSuccess)
			return r;
		if ((r = cudaMemcpy(buffer, bufferG, sizeof(LeNet5)*batch_size, cudaMemcpyDeviceToHost)) != cudaSuccess)
			return r;
#else
#pragma omp parallel for
for (unsigned j = 0; j < batch_size; j++)
TrainBatch(lenet, train_data + i, train_label + i, j, buffer);
#endif
		for (unsigned b = 1; b < batch_size; ++b) {	/* sum per-sample deltas into slot 0 (b avoids shadowing i) */
			double *p = buffer + b * GETCOUNT(LeNet5);
			FOREACH(j, GETCOUNT(LeNet5))
				buffer[j] += p[j];
		}
double k = ALPHA / batch_size;
FOREACH(i, GETCOUNT(LeNet5))
((double *)lenet)[i] += k * buffer[i];
		if (i * 100 / total_size > percent) {
			fprintf(stdout, "\b\b\b%2u%%", percent = i * 100 / total_size);
fflush(stdout);
}
}
#ifdef __CUDACC__
cudaFree(buf);
#endif
free(buffer);
fprintf(stdout, "\n");
return 0;
}
int testing(LeNet5 *lenet, image *test_data, uint8 *test_label, unsigned total_size)
{
	int right = 0;
	/* An OpenMP reduction is cheaper than the per-iteration critical section it replaces. */
#pragma omp parallel for reduction(+:right)
	for (unsigned i = 0; i < total_size; ++i)
	{
		uint8 l = test_label[i];
		uint8 p = Predict(lenet, test_data[i]);
		right += l == p;
	}
	return right;
}
int save(LeNet5 *lenet, const char *filename)
{
FILE *fp = fopen(filename, "wb");
if (!fp) return 1;
	int ok = fwrite(lenet, sizeof(LeNet5), 1, fp) == 1;
	fclose(fp);
	return ok ? 0 : 1;
}
int load(LeNet5 *lenet, const char *filename)
{
FILE *fp = fopen(filename, "rb");
if (!fp) return -1;
	int ok = fread(lenet, sizeof(LeNet5), 1, fp) == 1;
	fclose(fp);
	return ok ? 0 : -1;
}
int main()
{
image *train_data = (image *)calloc(COUNT_TRAIN, sizeof(image));
uint8 *train_label = (uint8 *)calloc(COUNT_TRAIN, sizeof(uint8));
if (read_data(train_data, train_label, COUNT_TRAIN, FILE_TRAIN_IMAGE, FILE_TRAIN_LABEL))
{
free(train_data);
free(train_label);
return -1;
}
image *test_data = (image *)calloc(COUNT_TEST, sizeof(image));
uint8 *test_label = (uint8 *)calloc(COUNT_TEST, sizeof(uint8));
if (read_data(test_data, test_label, COUNT_TEST, FILE_TEST_IMAGE, FILE_TEST_LABEL))
{
free(test_data);
free(test_label);
return -1;
}
clock_t start = clock();
LeNet5 *lenet = (LeNet5 *)malloc(sizeof(LeNet5));
int create = load(lenet, LENET_FILE);
if (create) {
Initial(lenet);
int batches[] = { 384 };
for (unsigned i = 0; i < sizeof(batches) / sizeof(*batches);++i) {
if (training(lenet, train_data, train_label, batches[i], COUNT_TRAIN))
return -2;
}
save(lenet, LENET_FILE);
}
int right = testing(lenet, test_data, test_label, COUNT_TEST);
fprintf(stdout, "Res: %d/%d (%.2f%%)\n", right, COUNT_TEST, (float)right*100/COUNT_TEST);
fprintf(stdout, "Time: %.2fs\n", (float)(clock() - start) / CLOCKS_PER_SEC);
free(lenet);
free(train_data);
free(train_label);
free(test_data);
free(test_label);
return 0;
}