/*
 * LeNet-5 on MNIST in a single C file, optionally built as CUDA.
 * Background reading (Chinese blog posts):
 *   LeNet-5:  https://blog.csdn.net/The_lastest/article/details/109611211
 *             https://blog.csdn.net/baidu_40840693/article/details/82958911
 *   softmax:  https://blog.csdn.net/qq_36767053/article/details/108070348
 *   backpropagation (bp)
 */
/* Fetch the MNIST data files (run in a shell):
for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
if [ ! -e $fname ]; then
wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
gunzip ${fname}.gz
fi
done
*/
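/*
 * Build notes: the same translation unit compiles as plain C (e.g. gcc with
 * -fopenmp) or as CUDA (e.g. nvcc -x cu). Defining AA (presumably for an
 * editor or analyzer without the CUDA toolchain) pulls in cuda_runtime_api.h
 * and forces the __CUDACC__ path by hand; under nvcc, __CUDACC__ is defined
 * automatically. The host/device/global macros below expand to the CUDA
 * function qualifiers when compiling device code and to nothing otherwise.
 */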
#ifdef AA
#include "include/cuda_runtime_api.h"
#define __CUDACC__
#endif
#ifdef __CUDACC__
#define host __device__ __host__
#define device __device__
#define global __global__
#else
#define host
#define device
#define global
#endif
#define LENGTH_KERNEL 5
#define LENGTH_FEATURE0 32
#define LENGTH_FEATURE1 (LENGTH_FEATURE0 - LENGTH_KERNEL + 1)
#define LENGTH_FEATURE2 (LENGTH_FEATURE1 >> 1)
#define LENGTH_FEATURE3 (LENGTH_FEATURE2 - LENGTH_KERNEL + 1)
#define LENGTH_FEATURE4 (LENGTH_FEATURE3 >> 1)
#define LENGTH_FEATURE5 (LENGTH_FEATURE4 - LENGTH_KERNEL + 1)
#define INPUT 1
#define LAYER1 6
#define LAYER2 6
#define LAYER3 16
#define LAYER4 16
#define LAYER5 120
#define OUTPUT 10
#define ALPHA 0.5
#define PADDING 2
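/*
 * Shape bookkeeping: a 28x28 MNIST image is zero-padded by PADDING=2 to 32x32
 * (LENGTH_FEATURE0). Each 5x5 valid convolution shrinks a side by 4, each 2x2
 * max-pool halves it:
 *   32 -conv5-> 28 -pool-> 14 -conv5-> 10 -pool-> 5 -conv5-> 1
 * so layer5 is 120 maps of 1x1, i.e. a 120-vector feeding the 10-way output.
 */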
typedef unsigned char uint8;
typedef uint8 image[28][28];
typedef struct LeNet5
{
double weight0_1[INPUT][LAYER1][LENGTH_KERNEL][LENGTH_KERNEL];
double weight2_3[LAYER2][LAYER3][LENGTH_KERNEL][LENGTH_KERNEL];
double weight4_5[LAYER4][LAYER5][LENGTH_KERNEL][LENGTH_KERNEL];
double weight5_6[LAYER5 * LENGTH_FEATURE5 * LENGTH_FEATURE5][OUTPUT];
double bias0_1[LAYER1];
double bias2_3[LAYER3];
double bias4_5[LAYER5];
double bias5_6[OUTPUT];
}LeNet5;
typedef struct Feature
{
double input[INPUT][LENGTH_FEATURE0][LENGTH_FEATURE0];
double layer1[LAYER1][LENGTH_FEATURE1][LENGTH_FEATURE1];
double layer2[LAYER2][LENGTH_FEATURE2][LENGTH_FEATURE2];
double layer3[LAYER3][LENGTH_FEATURE3][LENGTH_FEATURE3];
double layer4[LAYER4][LENGTH_FEATURE4][LENGTH_FEATURE4];
double layer5[LAYER5][LENGTH_FEATURE5][LENGTH_FEATURE5];
double output[OUTPUT];
}Feature;
#define GETLENGTH(array) (sizeof(array)/sizeof(*(array)))
#define GETCOUNT(array) (sizeof(array)/sizeof(double))
#define FOREACH(i,count) for (unsigned i = 0; i < count; ++i)
#define CONVOLUTE_VALID(input,output,weight) \
{ \
FOREACH(o0,GETLENGTH(output)) \
FOREACH(o1,GETLENGTH(*(output))) \
FOREACH(w0,GETLENGTH(weight)) \
FOREACH(w1,GETLENGTH(*(weight))) \
(output)[o0][o1] += (input)[o0 + w0][o1 + w1] * (weight)[w0][w1]; \
}
#define CONVOLUTE_FULL(input,output,weight) \
{ \
FOREACH(i0,GETLENGTH(input)) \
FOREACH(i1,GETLENGTH(*(input))) \
FOREACH(w0,GETLENGTH(weight)) \
FOREACH(w1,GETLENGTH(*(weight))) \
(output)[i0 + w0][i1 + w1] += (input)[i0][i1] * (weight)[w0][w1]; \
}
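/*
 * CONVOLUTE_VALID computes a "valid" cross-correlation: for an NxN input and
 * a KxK kernel the result is (N-K+1)x(N-K+1), accumulated into `output`.
 * CONVOLUTE_FULL is its transpose: each input cell is scattered through the
 * kernel into an (N+K-1)x(N+K-1) output; the backward pass uses it to route
 * output error back onto the input error. No kernel flip is needed because
 * the forward pass is cross-correlation, not true convolution.
 */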
#define CONVOLUTION_FORWARD(input,output,weight,bias,action) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_VALID(input[x], output[y], weight[x][y]); \
FOREACH(j, GETLENGTH(output)) \
FOREACH(i, GETCOUNT(output[j])) \
((double *)output[j])[i] = action(((double *)output[j])[i] + bias[j]); \
}
#define CONVOLUTION_BACKWARD(input,inerror,outerror,weight,wd,bd,actiongrad)\
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_FULL(outerror[y], inerror[x], weight[x][y]); \
FOREACH(i, GETCOUNT(inerror)) \
((double *)inerror)[i] *= actiongrad(((double *)input)[i]); \
FOREACH(j, GETLENGTH(outerror)) \
FOREACH(i, GETCOUNT(outerror[j])) \
bd[j] += ((double *)outerror[j])[i]; \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
CONVOLUTE_VALID(input[x], wd[x][y], outerror[y]); \
}
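/*
 * CONVOLUTION_BACKWARD, step by step:
 *   1. inerror  = sum over output maps of the full correlation of outerror
 *                 with the connecting kernel (transpose of the forward pass);
 *   2. inerror *= actiongrad(activation): the chain rule through the
 *                 activation of the layer below;
 *   3. bd[j]    = sum of outerror[j], since the bias feeds every output cell;
 *   4. wd[x][y] = valid correlation of input map x with outerror map y,
 *                 i.e. the loss gradient for that kernel.
 */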
#define SUBSAMP_MAX_FORWARD(input,output) \
{ \
const int len0 = GETLENGTH(*(input)) / GETLENGTH(*(output)); \
const int len1 = GETLENGTH(**(input)) / GETLENGTH(**(output)); \
FOREACH(i, GETLENGTH(output)) \
FOREACH(o0, GETLENGTH(*(output))) \
FOREACH(o1, GETLENGTH(**(output))) \
{ \
int x0 = 0, x1 = 0, ismax; \
FOREACH(l0, len0) \
FOREACH(l1, len1) \
{ \
ismax = input[i][o0*len0 + l0][o1*len1 + l1] > input[i][o0*len0 + x0][o1*len1 + x1];\
x0 += ismax * (l0 - x0); \
x1 += ismax * (l1 - x1); \
} \
output[i][o0][o1] = input[i][o0*len0 + x0][o1*len1 + x1]; \
} \
}
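/*
 * Branchless argmax inside each len0 x len1 pooling window: when the current
 * cell beats the running maximum, ismax is 1 and (x0, x1) jumps to (l0, l1);
 * otherwise it stays put. The backward version repeats the same scan and
 * routes the error only to the winning cell (every other inerror cell was
 * already zeroed by the caller's memset).
 */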
#define SUBSAMP_MAX_BACKWARD(input,inerror,outerror) \
{ \
const int len0 = GETLENGTH(*(inerror)) / GETLENGTH(*(outerror)); \
const int len1 = GETLENGTH(**(inerror)) / GETLENGTH(**(outerror)); \
FOREACH(i, GETLENGTH(outerror)) \
FOREACH(o0, GETLENGTH(*(outerror))) \
FOREACH(o1, GETLENGTH(**(outerror))) \
{ \
int x0 = 0, x1 = 0, ismax; \
FOREACH(l0, len0) \
FOREACH(l1, len1) \
{ \
ismax = input[i][o0*len0 + l0][o1*len1 + l1] > input[i][o0*len0 + x0][o1*len1 + x1];\
x0 += ismax * (l0 - x0); \
x1 += ismax * (l1 - x1); \
} \
inerror[i][o0*len0 + x0][o1*len1 + x1] = outerror[i][o0][o1]; \
} \
}
#define DOT_PRODUCT_FORWARD(input,output,weight,bias,action) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
((double *)output)[y] += ((double *)input)[x] * weight[x][y]; \
FOREACH(j, GETLENGTH(bias)) \
((double *)output)[j] = action(((double *)output)[j] + bias[j]); \
}
#define DOT_PRODUCT_BACKWARD(input,inerror,outerror,weight,wd,bd,actiongrad) \
{ \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
((double *)inerror)[x] += ((double *)outerror)[y] * weight[x][y]; \
FOREACH(i, GETCOUNT(inerror)) \
((double *)inerror)[i] *= actiongrad(((double *)input)[i]); \
FOREACH(j, GETLENGTH(outerror)) \
bd[j] = ((double *)outerror)[j]; \
for (unsigned x = 0; x < GETLENGTH(weight); ++x) \
for (unsigned y = 0; y < GETLENGTH(*weight); ++y) \
wd[x][y] = ((double *)input)[x] * ((double *)outerror)[y]; \
}
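/*
 * The dot-product macros implement the fully connected output layer. Both
 * flatten their multi-dimensional arguments through the (double *) casts:
 * layer5 (120 maps of 1x1) becomes a 120-vector multiplied by the 120x10
 * weight matrix. The backward form emits inerror = W * outerror, the bias
 * gradient bd = outerror, and the rank-1 weight gradient
 * wd[x][y] = input[x] * outerror[y].
 */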
host static double relu(double x)
{
return x*(x > 0);
}
host static double relugrad(double y)
{
return y > 0;
}
host static double dot(double y)
{
return y;
}
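/*
 * relugrad is evaluated on the stored *activations*, not the pre-activation
 * sums; that is valid for ReLU because relu(x) > 0 exactly when x > 0 (and
 * max-pooling preserves the sign). `dot` is the identity, used so the output
 * layer stays linear ahead of the softmax in load_target().
 */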
host static inline void forward(LeNet5 *lenet, Feature *features, double(*action)(double))
{
CONVOLUTION_FORWARD(features->input, features->layer1, lenet->weight0_1, lenet->bias0_1, action);
SUBSAMP_MAX_FORWARD(features->layer1, features->layer2);
CONVOLUTION_FORWARD(features->layer2, features->layer3, lenet->weight2_3, lenet->bias2_3, action);
SUBSAMP_MAX_FORWARD(features->layer3, features->layer4);
CONVOLUTION_FORWARD(features->layer4, features->layer5, lenet->weight4_5, lenet->bias4_5, action);
DOT_PRODUCT_FORWARD(features->layer5, features->output, lenet->weight5_6, lenet->bias5_6, dot);
}
host static inline void backward(LeNet5 *lenet, LeNet5 *deltas, Feature *errors, Feature *features, double(*actiongrad)(double))
{
DOT_PRODUCT_BACKWARD(features->layer5, errors->layer5, errors->output, lenet->weight5_6, deltas->weight5_6, deltas->bias5_6, actiongrad);
CONVOLUTION_BACKWARD(features->layer4, errors->layer4, errors->layer5, lenet->weight4_5, deltas->weight4_5, deltas->bias4_5, actiongrad);
SUBSAMP_MAX_BACKWARD(features->layer3, errors->layer3, errors->layer4);
CONVOLUTION_BACKWARD(features->layer2, errors->layer2, errors->layer3, lenet->weight2_3, deltas->weight2_3, deltas->bias2_3, actiongrad);
SUBSAMP_MAX_BACKWARD(features->layer1, errors->layer1, errors->layer2);
CONVOLUTION_BACKWARD(features->input, errors->input, errors->layer1, lenet->weight0_1, deltas->weight0_1, deltas->bias0_1, actiongrad);
}
#include <math.h>
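/*
 * load_input standardizes the raw image to zero mean and unit variance and
 * writes it into the centre of the 32x32 input plane; the PADDING-wide
 * border keeps its memset zero, which equals the mean after standardization.
 */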
host static inline void load_input(Feature *features, image input)
{
double (*layer0)[LENGTH_FEATURE0][LENGTH_FEATURE0] = features->input;
const long sz = sizeof(image) / sizeof(**input);
double mean = 0, std = 0;
FOREACH(j, sizeof(image) / sizeof(*input))
FOREACH(k, sizeof(*input) / sizeof(**input))
{
mean += input[j][k];
std += input[j][k] * input[j][k];
}
mean /= sz;
std = sqrt(std / sz - mean*mean);
FOREACH(j, sizeof(image) / sizeof(*input))
FOREACH(k, sizeof(*input) / sizeof(**input))
{
layer0[0][j + PADDING][k + PADDING] = (input[j][k] - mean) / std;
}
}
host static inline void softmax(double input[OUTPUT], double loss[OUTPUT], unsigned label, unsigned count)
{
double inner = 0;
for (unsigned i = 0; i < count; ++i)
{
double res = 0;
for (unsigned j = 0; j < count; ++j)
{
res += exp(input[j] - input[i]);
}
loss[i] = 1. / res;
inner -= loss[i] * loss[i];
}
inner += loss[label];
for (unsigned i = 0; i < count; ++i)
{
loss[i] *= (i == label) - loss[i] - inner;
}
}
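/*
 * Derivation of the error written by softmax(): the first loop computes
 *   p_i = exp(x_i) / sum_j exp(x_j)
 * as 1 / sum_j exp(x_j - x_i), which avoids overflow at the price of an
 * O(n^2) pass. The loss is the squared error L = 1/2 * sum_j (p_j - t_j)^2
 * against the one-hot target t. Using dp_j/dx_i = p_j * (delta_ij - p_i),
 *   -dL/dx_i = p_i * ((t_i - p_i) - (p_label - sum_j p_j^2))
 * which is exactly loss[i] * ((i == label) - loss[i] - inner). The array
 * therefore leaves this function holding the *negative* gradient, so every
 * later update simply adds the ALPHA-scaled deltas.
 */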
host static inline void load_target(Feature *features, Feature *errors, int label)
{
double *output = (double *)features->output;
double *error = (double *)errors->output;
softmax(output, error, label, GETCOUNT(features->output));
}
static inline uint8 get_result(Feature *features)
{
double *output = (double *)features->output;
const int count = GETCOUNT(features->output);
uint8 result = 0;
double maxvalue = *output;
for (uint8 i = 1; i < count; ++i)
{
if (output[i] > maxvalue)
{
maxvalue = output[i];
result = i;
}
}
return result;
}
#include "memory.h"
global void TrainBatch(LeNet5 *lenet, image *inputs, uint8 *labels, unsigned i, void *buf)
{
#ifdef __CUDACC__
i = threadIdx.x + blockIdx.x * blockDim.x;
#endif
Feature features;
Feature errors;
LeNet5 *deltas = (LeNet5 *)buf + i;
memset(&features, 0, sizeof features);
memset(&errors, 0, sizeof errors);
memset(deltas, 0, sizeof(LeNet5));
load_input(&features, inputs[i]);
forward(lenet, &features, relu);
load_target(&features, &errors, labels[i]);
backward(lenet, deltas, &errors, &features, relugrad);
}
static inline void Train(LeNet5 *lenet, image input, uint8 label)
{
Feature features;
Feature errors;
LeNet5 deltas;
memset(&features, 0, sizeof features);
memset(&errors, 0, sizeof errors);
memset(&deltas, 0, sizeof deltas);
load_input(&features, input);
forward(lenet, &features, relu);
load_target(&features, &errors, label);
backward(lenet, &deltas, &errors, &features, relugrad);
FOREACH(i, GETCOUNT(LeNet5))
((double *)lenet)[i] += ALPHA * ((double *)&deltas)[i];
}
static inline uint8 Predict(LeNet5 *lenet, image input)
{
Feature features;
memset(&features, 0, sizeof features);
load_input(&features, input);
forward(lenet, &features, relu);
return get_result(&features);
}
#include <time.h>
#include <stdlib.h>
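/*
 * f64rand returns a uniform double in [-1, 1) by bit-twiddling: randbit is
 * the number of random bits per rand() call (assuming RAND_MAX = 2^k - 1),
 * the loop packs rand() outputs into the 52 mantissa bits, and the constant
 * 0x4000000000000000 sets the exponent of 2.0, so the assembled IEEE-754
 * value lies in [2, 4); subtracting 3 maps it to [-1, 1).
 */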
static int randbit = 0;
static inline double f64rand()
{
if (!randbit)
{
srand((unsigned)time(0));
for (unsigned i = RAND_MAX; i; i >>= 1, ++randbit);
}
	unsigned long long lvalue = 0x4000000000000000ULL;	/* exponent bits of 2.0 */
	int i = 52 - randbit;
	for (; i > 0; i -= randbit)
		lvalue |= (unsigned long long)rand() << i;
	lvalue |= (unsigned long long)rand() >> -i;
	double d;
	memcpy(&d, &lvalue, sizeof d);	/* type-pun via memcpy, not a pointer cast (strict aliasing) */
	return d - 3;	/* [2, 4) - 3 = [-1, 1) */
}
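/*
 * Initial applies Glorot/Xavier uniform initialization: uniform(-1, 1) noise
 * rescaled per layer to the limit sqrt(6 / (fan_in + fan_out)), where a conv
 * layer's fan counts are kernel^2 * maps (hence LENGTH_KERNEL * LENGTH_KERNEL
 * * (INPUT + LAYER1), and so on). Biases start at zero.
 */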
static void Initial(LeNet5 *lenet)
{
for (double *pos = (double *)lenet->weight0_1; pos < (double *)lenet->bias0_1; *pos++ = f64rand());
for (double *pos = (double *)lenet->weight0_1; pos < (double *)lenet->weight2_3; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (INPUT + LAYER1))));
for (double *pos = (double *)lenet->weight2_3; pos < (double *)lenet->weight4_5; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (LAYER2 + LAYER3))));
for (double *pos = (double *)lenet->weight4_5; pos < (double *)lenet->weight5_6; *pos++ *= sqrt(6.0 / (LENGTH_KERNEL * LENGTH_KERNEL * (LAYER4 + LAYER5))));
for (double *pos = (double *)lenet->weight5_6; pos < (double *)lenet->bias0_1; *pos++ *= sqrt(6.0 / (LAYER5 + OUTPUT)));
	for (double *pos = (double *)lenet->bias0_1; pos < (double *)(lenet + 1); *pos++ = 0);	/* zero all biases */
}
#define FILE_TRAIN_IMAGE "train-images-idx3-ubyte"
#define FILE_TRAIN_LABEL "train-labels-idx1-ubyte"
#define FILE_TEST_IMAGE "t10k-images-idx3-ubyte"
#define FILE_TEST_LABEL "t10k-labels-idx1-ubyte"
#define LENET_FILE "model.dat"
#define COUNT_TRAIN 60000
#define COUNT_TEST 10000
#include <stdio.h>
int read_data(unsigned char(*data)[28][28], unsigned char label[], const int count, const char data_file[], const char label_file[])
{
	FILE *fp_image = fopen(data_file, "rb");
	FILE *fp_label = fopen(label_file, "rb");
	if (!fp_image || !fp_label)
	{
		if (fp_image) fclose(fp_image);
		if (fp_label) fclose(fp_label);
		return -1;
	}
	/* Skip the idx file headers: 16 bytes for images, 8 for labels. */
	fseek(fp_image, 16, SEEK_SET);
	fseek(fp_label, 8, SEEK_SET);
	int ok = fread(data, sizeof(image)*count, 1, fp_image) == 1
	      && fread(label, count, 1, fp_label) == 1;
	fclose(fp_image);
	fclose(fp_label);
	return ok ? 0 : -1;
}
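/*
 * training() runs one pass over the data in mini-batches: per-sample deltas
 * land in buffer, are summed into slot 0, scaled by ALPHA / batch_size, and
 * added to the weights (the deltas already hold the negative gradient). The
 * CUDA path packs everything into one device allocation -- batch_size delta
 * slots, one LeNet5 copy, then all images and labels -- and launches one
 * thread per sample, so batch_size must be a multiple of the block size (32).
 */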
int training(LeNet5 *lenet, image *train_data, uint8 *train_label, unsigned batch_size, unsigned total_size)
{
fprintf(stdout, "batchsize:%d\ttrain:%2d%%", batch_size, 0);
double *buffer = (double *) malloc(sizeof(LeNet5)*batch_size);
#ifdef __CUDACC__
cudaError_t r = cudaSuccess;
void *buf;
	if ((r = cudaMalloc(&buf, sizeof(LeNet5)*(batch_size+1) + (sizeof(image)+sizeof(uint8))*total_size)) != cudaSuccess)
		return r;
double *bufferG = (double *)buf;
LeNet5 *lenetG = (LeNet5 *)buf + batch_size;
image *train_dataG = (image *)(lenetG + 1);
uint8 *train_labelG = (uint8 *)(train_dataG + total_size);
	if ((r = cudaMemcpy(train_dataG, train_data, sizeof(image)*total_size, cudaMemcpyHostToDevice)) != cudaSuccess)
		return r;
	if ((r = cudaMemcpy(train_labelG, train_label, sizeof(uint8)*total_size, cudaMemcpyHostToDevice)) != cudaSuccess)
		return r;
#endif
for (unsigned i = 0, percent = 0; i <= total_size - batch_size; i += batch_size)
{
#ifdef __CUDACC__
		if ((r = cudaMemcpy(lenetG, lenet, sizeof(LeNet5), cudaMemcpyHostToDevice)) != cudaSuccess)
			return r;
const int block = 32;
TrainBatch<<<batch_size/block, block>>>(lenetG, train_dataG + i, train_labelG + i, batch_size, bufferG);
cudaError_t e = cudaGetLastError();
if (e) {
fprintf(stdout, "\nGPU: %d (%s)\n", e, cudaGetErrorString(e));
return e;
}
		if ((r = cudaDeviceSynchronize()) != cudaSuccess)
			return r;
		if ((r = cudaMemcpy(buffer, bufferG, sizeof(LeNet5)*batch_size, cudaMemcpyDeviceToHost)) != cudaSuccess)
			return r;
#else
#pragma omp parallel for
for (unsigned j = 0; j < batch_size; j++)
TrainBatch(lenet, train_data + i, train_label + i, j, buffer);
#endif
		for (unsigned b = 1; b < batch_size; ++b) {	/* sum per-sample deltas into slot 0 (b avoids shadowing i) */
			double *p = buffer + b * GETCOUNT(LeNet5);
			FOREACH(j, GETCOUNT(LeNet5))
				buffer[j] += p[j];
		}
double k = ALPHA / batch_size;
FOREACH(i, GETCOUNT(LeNet5))
((double *)lenet)[i] += k * buffer[i];
		if (i * 100 / total_size > percent) {
			fprintf(stdout, "\b\b\b%2u%%", percent = i * 100 / total_size);
fflush(stdout);
}
}
#ifdef __CUDACC__
cudaFree(buf);
#endif
free(buffer);
fprintf(stdout, "\n");
return 0;
}
int testing(LeNet5 *lenet, image *test_data, uint8 *test_label, unsigned total_size)
{
	int right = 0;
	/* An OpenMP reduction is cheaper than the per-iteration critical section it replaces. */
#pragma omp parallel for reduction(+:right)
	for (unsigned i = 0; i < total_size; ++i)
	{
		uint8 l = test_label[i];
		uint8 p = Predict(lenet, test_data[i]);
		right += l == p;
	}
	return right;
}
int save(LeNet5 *lenet, const char *filename)
{
FILE *fp = fopen(filename, "wb");
if (!fp) return 1;
	int ok = fwrite(lenet, sizeof(LeNet5), 1, fp) == 1;
	fclose(fp);
	return ok ? 0 : 1;
}
int load(LeNet5 *lenet, const char *filename)
{
FILE *fp = fopen(filename, "rb");
if (!fp) return -1;
	int ok = fread(lenet, sizeof(LeNet5), 1, fp) == 1;
	fclose(fp);
	return ok ? 0 : -1;
}
int main()
{
image *train_data = (image *)calloc(COUNT_TRAIN, sizeof(image));
uint8 *train_label = (uint8 *)calloc(COUNT_TRAIN, sizeof(uint8));
if (read_data(train_data, train_label, COUNT_TRAIN, FILE_TRAIN_IMAGE, FILE_TRAIN_LABEL))
{
free(train_data);
free(train_label);
return -1;
}
image *test_data = (image *)calloc(COUNT_TEST, sizeof(image));
uint8 *test_label = (uint8 *)calloc(COUNT_TEST, sizeof(uint8));
if (read_data(test_data, test_label, COUNT_TEST, FILE_TEST_IMAGE, FILE_TEST_LABEL))
{
free(test_data);
free(test_label);
return -1;
}
clock_t start = clock();
LeNet5 *lenet = (LeNet5 *)malloc(sizeof(LeNet5));
int create = load(lenet, LENET_FILE);
if (create) {
Initial(lenet);
int batches[] = { 384 };
for (unsigned i = 0; i < sizeof(batches) / sizeof(*batches);++i) {
if (training(lenet, train_data, train_label, batches[i], COUNT_TRAIN))
return -2;
}
save(lenet, LENET_FILE);
}
int right = testing(lenet, test_data, test_label, COUNT_TEST);
fprintf(stdout, "Res: %d/%d (%.2f%%)\n", right, COUNT_TEST, (float)right*100/COUNT_TEST);
fprintf(stdout, "Time: %.2fs\n", (float)(clock() - start) / CLOCKS_PER_SEC);
free(lenet);
free(train_data);
free(train_label);
free(test_data);
free(test_label);
return 0;
}