我的第一个CUDA程序——向量点乘(修正)
要求:计算两个1000个项的向量的点乘。
本来以为能很快写完的,结果跑出的答案是错的,后来对照着NVIDIA的示例,加上cutilCheckMsg("kernel launch failure");这句,运行到这里直接挂掉了。
然后我想了一会儿,尝试性的把1000改成了100,运行成功了,再翻一下书发现Block最多线程是512个,于是我内牛满面的在那里重重的画了几笔,这算是我遇到的第一个CUDA错误吧。果然够脑残,终身难忘。
接下来这个是直接对global memory操作的,按照题目的要求,我把blockMax在2和500之间改了N次,但运行时间差别不大,都在62和78这两个值之间徘徊(release版),只是有时blockMax=2会突然跳到31,我想不通为什么。当然用clock()有些粗略,只记录kernel调用的话直接得到的值为0了。第二天又看了书,才发觉应该用Event的,于是把代码修改了一下,现在能显示比较精确的时间了。blockMax取2和500时,差别依然不大,都在0.0134毫秒左右(cudaEventElapsedTime返回的时间单位是毫秒)。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil_inline.h>
float* h_A;
// Host-side (h_) buffers: second input vector and the result copied back
// from the device. Allocated with malloc and released with free in main().
float *h_B, *h_C;
// Device-side (d_) buffers: both inputs and the output. Allocated with
// cudaMalloc and released with cudaFree in main().
float *d_A, *d_B, *d_C;
// Element-wise product kernel: C[k] = A[k] * B[k] for every k in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid
// must supply at least N threads in total; threads whose global index is
// N or larger return without touching memory.
//
//   A, B : input vectors (read only), at least N elements each
//   C    : output vector, at least N elements
//   N    : number of elements to process
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] * B[idx];
}
// Host driver for the global-memory version of DotMulVet.
//
// Fills two N-element host vectors with random values in [0, 1], copies them
// to the device, launches the kernel with blockMax threads per block, times
// the launch with CUDA events, copies the result back and compares every
// element against the product computed on the CPU.
//
// Returns 0 on completion, 1 if a host allocation fails. CUDA API failures
// are reported through cutilSafeCall / cutilCheckMsg.
int main()
{
    const int N=1000;
    int i;
    const size_t mem_size=sizeof(float)*N;
    cudaEvent_t start, finish;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&finish));

    printf("Start build\n");
    h_A=(float*)malloc(mem_size);
    h_B=(float*)malloc(mem_size);
    h_C=(float*)malloc(mem_size);
    if(h_A==NULL || h_B==NULL || h_C==NULL)
    {
        fprintf(stderr,"host allocation failed\n");
        free(h_A);   // free(NULL) is a no-op, so partial failure is handled
        free(h_B);
        free(h_C);
        cutilSafeCall(cudaEventDestroy(start));
        cutilSafeCall(cudaEventDestroy(finish));
        return 1;
    }
    for(i=0;i<N;i++)
    {
        // Convert to float BEFORE dividing: rand()/RAND_MAX is an integer
        // division that yields 0 for every value except RAND_MAX itself,
        // which would make the correctness check below meaningless.
        h_A[i]=(float)rand()/(float)RAND_MAX;
        h_B[i]=(float)rand()/(float)RAND_MAX;
    }

    cutilSafeCall(cudaMalloc((void**)&d_A,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_B,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_C,mem_size));
    cutilSafeCall(cudaMemcpy(d_A,h_A,mem_size,cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B,h_B,mem_size,cudaMemcpyHostToDevice));

    printf("Start compute\n");
    int blockMax=500;                          // threads per block
    int blockNum=(N+blockMax-1)/blockMax;      // ceil(N / blockMax) blocks
    cutilSafeCall(cudaEventRecord(start,0));
    // This version uses global memory only, so no dynamic shared-memory
    // size is passed as a third launch argument.
    DotMulVet<<< blockNum , blockMax>>>(d_A,d_B,d_C,N);
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventRecord(finish,0));
    // Wait for the finish event so the elapsed time covers the whole kernel.
    cutilSafeCall(cudaEventSynchronize(finish));

    cutilSafeCall(cudaMemcpy(h_C,d_C,mem_size,cudaMemcpyDeviceToHost));
    // Stop at the first mismatch; i==N afterwards means every element matched.
    for(i=0;i<N;i++)
    {
        float ans=h_A[i]*h_B[i];
        if(fabs(ans-h_C[i])>1E-6)
            break;
    }
    printf("Result: %s\n",(i==N)?"Correct":"Wrong");

    float costTime;   // milliseconds between the two recorded events
    cutilSafeCall(cudaEventElapsedTime(&costTime,start,finish));
    printf("Cost Time : %f\n",costTime);
    getchar();        // keep the console window open until a key is pressed

    free(h_A);
    free(h_B);
    free(h_C);
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(finish));
    return 0;
}
题目还要求对使用shared memory和global memory做对比,于是我又改了一下DotMulVet函数,这下能看出差别了,但时间反而增加了:blockMax为500时是0.0135毫秒,为2时是0.0225毫秒。由于在device code中,对A、B、C的每个元素都只访问一次,使用shared memory是多此一举的,只会增加一次额外的读写,所以时间有所增加。每个块的线程数为2的话,需要更多的块,耗费就更大了。(个人推测)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil_inline.h>

// Host-side (h_) and device-side (d_) buffers for the two inputs and the output.
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;

// Element-wise product kernel that stages its operands in shared memory:
// C[i] = A[i] * B[i] for every i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element, with at
// least N threads in total.
// Dynamic shared memory required: 2 * blockDim.x * sizeof(float) bytes
// (first half holds the block's slice of A, second half its slice of B).
//
// Every thread reads and writes only its own two shared-memory slots, so no
// __syncthreads() is needed. All global accesses are inside the i<N guard so
// the tail block never reads or writes out of bounds.
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    extern __shared__ float sdata[];   // shared memory
    float* sA=sdata;                   // blockDim.x floats
    float* sB=sdata+blockDim.x;        // next blockDim.x floats
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    if(i<N)
    {
        sA[threadIdx.x]=A[i];
        sB[threadIdx.x]=B[i];
        sA[threadIdx.x]=sA[threadIdx.x]*sB[threadIdx.x];
        C[i]=sA[threadIdx.x];
    }
}

// Host driver for the shared-memory version of DotMulVet.
//
// Fills two N-element host vectors with random values in [0, 1], copies them
// to the device, launches the kernel with blockMax threads per block, times
// the launch with CUDA events, copies the result back and compares every
// element against the product computed on the CPU.
//
// Returns 0 on completion, 1 if a host allocation fails. CUDA API failures
// are reported through cutilSafeCall / cutilCheckMsg.
int main()
{
    const int N=1000;
    int i;
    const size_t mem_size=sizeof(float)*N;
    cudaEvent_t start, finish;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&finish));

    printf("Start build\n");
    h_A=(float*)malloc(mem_size);
    h_B=(float*)malloc(mem_size);
    h_C=(float*)malloc(mem_size);
    if(h_A==NULL || h_B==NULL || h_C==NULL)
    {
        fprintf(stderr,"host allocation failed\n");
        free(h_A);   // free(NULL) is a no-op, so partial failure is handled
        free(h_B);
        free(h_C);
        cutilSafeCall(cudaEventDestroy(start));
        cutilSafeCall(cudaEventDestroy(finish));
        return 1;
    }
    for(i=0;i<N;i++)
    {
        // Convert to float BEFORE dividing: rand()/RAND_MAX is an integer
        // division that yields 0 for every value except RAND_MAX itself,
        // which would make the correctness check below meaningless.
        h_A[i]=(float)rand()/(float)RAND_MAX;
        h_B[i]=(float)rand()/(float)RAND_MAX;
    }

    cutilSafeCall(cudaMalloc((void**)&d_A,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_B,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_C,mem_size));
    cutilSafeCall(cudaMemcpy(d_A,h_A,mem_size,cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B,h_B,mem_size,cudaMemcpyHostToDevice));

    printf("Start compute\n");
    int blockMax=2;                            // threads per block
    int blockNum=(N+blockMax-1)/blockMax;      // ceil(N / blockMax) blocks
    // Per-block dynamic shared memory: one float of A and one of B per thread.
    size_t sharedBytes=2*(size_t)blockMax*sizeof(float);
    cutilSafeCall(cudaEventRecord(start,0));
    // The third launch argument (Ns) sets the dynamic shared-memory size.
    DotMulVet<<< blockNum , blockMax , sharedBytes>>>(d_A,d_B,d_C,N);
    cutilCheckMsg("kernel launch failure");
    cutilSafeCall(cudaEventRecord(finish,0));
    // Wait for the finish event so the elapsed time covers the whole kernel.
    cutilSafeCall(cudaEventSynchronize(finish));

    cutilSafeCall(cudaMemcpy(h_C,d_C,mem_size,cudaMemcpyDeviceToHost));
    // Stop at the first mismatch; i==N afterwards means every element matched.
    for(i=0;i<N;i++)
    {
        float ans=h_A[i]*h_B[i];
        if(fabs(ans-h_C[i])>1E-6)
            break;
    }
    printf("Result: %s\n",(i==N)?"Correct":"Wrong");

    float costTime;   // milliseconds between the two recorded events
    cutilSafeCall(cudaEventElapsedTime(&costTime,start,finish));
    printf("Cost Time : %f\n",costTime);
    getchar();        // keep the console window open until a key is pressed

    free(h_A);
    free(h_B);
    free(h_C);
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(finish));
    return 0;
}
本文原创,转载请注明出处: