Mixed C++ and CUDA programming: implementing a traditional neural network
Going straight to the code.
It learns to predict y = x1 + x2. Gradient descent converges very slowly, ha... but the GPU runtime itself is fast!!
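The network implemented below has two inputs, one hidden layer of NEURESIZE (50) sigmoid neurons, and a single sigmoid output. For one training sample (x1, x2) with target y, the quantities the code computes are:

    yh[i] = sigmoid(w1[i]*x1 + w2[i]*x2)      // hidden activations, one GPU thread per neuron
    y2    = sigmoid( sum_i v[i]*yh[i] )       // output, summed on the CPU
    cLoss = y2 * (1 - y2) * (y - y2)          // output error term used for the weight update

with sigmoid(z) = 1/(1 + e^(-z)). The hidden layer and the weight update run as CUDA kernels with one thread per hidden neuron; everything else stays on the CPU.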
//
//  main.cpp
//  bp
//
//  Created by jzc on 2018/4/18.
//  Copyright © 2018 jzc. All rights reserved.
//
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <fstream>
#include <cuda_runtime.h>
using namespace std;

#define DATASIZE  10000   // number of training samples
#define TESTSIZE  100     // number of test samples
#define NEURESIZE 50      // number of hidden neurons
#define RW        0.1     // learning rate
#define EPOCH     1000
#define E         2.71828

// Print the properties of a CUDA device
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// CUDA initialization: pick the first device with compute capability >= 1.x
bool InitCUDA()
{
    int count;
    // Get the number of CUDA-capable devices
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            // Print the device properties
            printDeviceProp(prop);
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Fill a[] with uniform random values in [0, range) shifted down by offset
void init(int num, int range, double a[], double offset){
    for(int i = 0; i < num; i++){
        a[i] = (double)(rand() % (range*1000) / 1000.0) - offset;
    }
}

// Store the minimum of a[] in m[0] and the maximum in m[1]
void getM(int num, double a[], double m[]){
    m[0] = m[1] = 0.0;
    for(int i = 0; i < num; i++){
        if(a[i] < m[0]){
            m[0] = a[i];
        }else if(a[i] > m[1]){
            m[1] = a[i];
        }
    }
}

// Scale a[] into (0, 1] using the min/max stored in m[]
void normalize(int num, double a[], double m[]){
    for(int i = 0; i < num; i++){
        a[i] = (a[i] - m[0] + 1) / (m[1] - m[0] + 1);
    }
}

// Inverse of normalize(): map values back to the original range
void renorm(int num, double a[], double m[]){
    for(int i = 0; i < num; i++){
        a[i] = a[i]*(m[1] - m[0] + 1) + m[0] - 1;
    }
}

void printArray(int num, double a[]){
    for(int i = 0; i < num; i++){
        printf("%6.4lf ", a[i]);
        if((i+1) % 10 == 0){
            cout << endl;
        }
    }
}

// Hidden layer for one sample: one thread per hidden neuron,
// yh[i] = sigmoid(w1[i]*x1 + w2[i]*x2)
__global__ static void hidenLayer(double x1, double x2, double w1[], double w2[], double yh[]){
    const int i = threadIdx.x;
    yh[i] = w1[i]*x1 + w2[i]*x2;
    yh[i] = 1/(1 + pow(E, 0 - yh[i]));
}

// Output layer (on the CPU): y2 = sigmoid(sum_i yh[i]*v[i])
double outLayer(double yh[], double v[]){
    double y2 = 0.0;   // accumulator must start at zero
    for(int i = 0; i < NEURESIZE; i++){
        y2 += yh[i] * v[i];
    }
    y2 = 1/(1 + pow(E, 0 - y2));
    return y2;
}

// Weight update for one sample: one thread per hidden neuron,
// scaled by the output error term *loss and the learning rate RW
__global__ static void update(double x1[], double x2[], double yh[], double v[], double w1[], double w2[], double *loss){
    const int i = threadIdx.x;
    w1[i] += x1[i] * (1 - x1[i]) * (*loss) * RW;
    w2[i] += x2[i] * (1 - x2[i]) * (*loss) * RW;
    v[i]  += yh[i] * (*loss) * RW;
}
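// Host-side usage of the two kernels for a single training sample, as done in
// main() below: NEURESIZE threads in one block compute the hidden activations,
// the result is copied back, and the output is summed on the CPU:
//
//     hidenLayer<<<1, NEURESIZE>>>(x1[i], x2[i], w1_g, w2_g, yh_g);
//     cudaMemcpy(yh, yh_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
//     y2[i] = outLayer(yh, v);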
// Evaluation on a fresh random test set (currently disabled):
/*
double test(double w1[], double w2[], double v[], double m1[], double m2[], double my[]){
    double tx1[TESTSIZE], tx2[TESTSIZE], ty[TESTSIZE], tyh[NEURESIZE], ty2[TESTSIZE];
    double avLoss = 0.0;
    init(TESTSIZE, 10, tx1, 0.0);
    init(TESTSIZE, 10, tx2, 0.0);
    for(int i = 0; i < TESTSIZE; i++){
        ty[i] = tx1[i] + tx2[i];
    }
    normalize(TESTSIZE, tx1, m1);
    normalize(TESTSIZE, tx2, m2);
    for(int q = 0; q < TESTSIZE; q++){
        hidenLayer(tx1[q], tx2[q], w1, w2, tyh);
        ty2[q] = outLayer(tyh, v);
    }
    renorm(TESTSIZE, ty2, my);
    for(int i = 0; i < TESTSIZE; i++){
        if(i < 10){
            printf("%2d y=%2.4f y2=%2.4f\n", i, ty[i], ty2[i]);
        }
        avLoss += pow(ty[i] - ty2[i], 2);
    }
    avLoss /= TESTSIZE;
    //cout << avLoss << endl;
    return avLoss;
}
*/

int main(){
    ofstream outf;
    outf.open("trainloss.txt");
    srand((unsigned)time(NULL));
    long starttime = clock();

    double x1[DATASIZE], x2[DATASIZE], y[DATASIZE], y2[DATASIZE];
    double w1[NEURESIZE], w2[NEURESIZE], v[NEURESIZE], yh[NEURESIZE];
    double m1[2], m2[2], my[2];
    double cLoss, minTrainLoss = 1.0, minTestLoss = 1.0;

    // Random training data and initial weights
    init(DATASIZE, 10, x1, 0.0);
    init(DATASIZE, 10, x2, 0.0);
    init(NEURESIZE, 2, w1, 1.0);
    init(NEURESIZE, 2, w2, 1.0);
    init(NEURESIZE, 2, v, 1.0);
    for(int i = 0; i < DATASIZE; i++){
        y[i] = x1[i] + x2[i];
    }

    // CUDA initialization
    if (!InitCUDA()) {
        return 0;
    }

    // Normalize the data on the host before copying it to the GPU,
    // so the device arrays hold the normalized values
    getM(DATASIZE, x1, m1);
    getM(DATASIZE, x2, m2);
    getM(DATASIZE, y, my);
    normalize(DATASIZE, x1, m1);
    normalize(DATASIZE, x2, m2);
    normalize(DATASIZE, y, my);

    // cudaMalloc: allocate device (GPU) memory
    double *x1_g, *x2_g, *y_g, *y2_g;
    double *w1_g, *w2_g, *v_g, *yh_g;
    double *cLoss_g;
    cudaMalloc((void**)&x1_g, sizeof(double) * DATASIZE);
    cudaMalloc((void**)&x2_g, sizeof(double) * DATASIZE);
    cudaMalloc((void**)&y_g,  sizeof(double) * DATASIZE);
    cudaMalloc((void**)&y2_g, sizeof(double) * DATASIZE);
    cudaMalloc((void**)&w1_g, sizeof(double) * NEURESIZE);
    cudaMalloc((void**)&w2_g, sizeof(double) * NEURESIZE);
    cudaMalloc((void**)&v_g,  sizeof(double) * NEURESIZE);
    cudaMalloc((void**)&yh_g, sizeof(double) * NEURESIZE);
    cudaMalloc((void**)&cLoss_g, sizeof(double));

    // cudaMemcpy: copy the data and initial weights to device memory
    // cudaMemcpyHostToDevice - host memory to device memory
    // cudaMemcpyDeviceToHost - device memory to host memory
    cudaMemcpy(w1_g, w1, sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(w2_g, w2, sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(v_g,  v,  sizeof(double) * NEURESIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(x1_g, x1, sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(x2_g, x2, sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(y_g,  y,  sizeof(double) * DATASIZE, cudaMemcpyHostToDevice);

    for(int j = 0; j < EPOCH; j++){
        double tLoss = 0.0;
        for(int i = 0; i < DATASIZE; i++){
            // Hidden-layer forward pass on the GPU. The sample values are passed
            // by value from the host arrays; device pointers like x1_g cannot be
            // indexed from host code.
            hidenLayer<<<1, NEURESIZE, 0>>>(x1[i], x2[i], w1_g, w2_g, yh_g);
            cudaMemcpy(yh, yh_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
            cudaMemcpy(v,  v_g,  sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
            // Output layer and error term on the CPU
            y2[i] = outLayer(yh, v);
            cLoss = y2[i] * (1 - y2[i]) * (y[i] - y2[i]);
            cudaMemcpy(cLoss_g, &cLoss, sizeof(double), cudaMemcpyHostToDevice);
            // Weight update on the GPU
            update<<<1, NEURESIZE, 0>>>(x1_g, x2_g, yh_g, v_g, w1_g, w2_g, cLoss_g);
            // Copy the single error term back (it is a scalar, not an array)
            cudaMemcpy(&cLoss, cLoss_g, sizeof(double), cudaMemcpyDeviceToHost);
            cLoss = pow(cLoss, 2);
            cLoss = cLoss * (my[1] - my[0] + 1);
            tLoss += cLoss;
        }
        tLoss /= DATASIZE;
        if(tLoss < minTrainLoss){
            minTrainLoss = tLoss;
        }
        printf("EPOCH--%d, trainLoss--%0.4f\n", j, tLoss);
        outf << j << "\t" << tLoss << endl;

        /* Per-epoch evaluation on the test set (currently disabled):
        cudaMemcpy(w1, w1_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
        cudaMemcpy(w2, w2_g, sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
        cudaMemcpy(v,  v_g,  sizeof(double) * NEURESIZE, cudaMemcpyDeviceToHost);
        double avLoss = test(w1, w2, v, m1, m2, my);
        printf("EPOCH--%d, avLoss--%0.4f\n", j, avLoss);
        if(avLoss < minTestLoss){
            minTestLoss = avLoss;
        }
        */
        cout << "------------------" << endl;
    }
    printf("minTrainLoss--%0.4f\n", minTrainLoss);
    //printf("minTestLoss--%0.4f\n", minTestLoss);
    outf.close();

    // Free device memory
    cudaFree(x1_g);
    cudaFree(x2_g);
    cudaFree(y_g);
    cudaFree(y2_g);
    cudaFree(w1_g);
    cudaFree(w2_g);
    cudaFree(v_g);
    cudaFree(yh_g);
    cudaFree(cLoss_g);

    long endtime = clock() - starttime;
    float execution_time = (float)endtime / CLOCKS_PER_SEC;   // clock() ticks -> seconds
    cout << "total time cost: " << execution_time << endl;
    return 0;
}
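For reference, a source like this is built with NVIDIA's nvcc; assuming it is saved with a .cu extension (the file name and the optimization flag here are just examples), something like the following works:

    nvcc -O2 main.cu -o bp
    ./bp

The program prints the device properties, the per-epoch training loss (also written to trainloss.txt), and the total run time in seconds.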