我的第一个CUDA程序——向量点乘(修正)

要求:计算两个1000个项的向量的点乘。

 

本来以为能很快写完的,结果跑出的答案是错的,后来对照着NVIDIA的示例,加上cutilCheckMsg("kernel launch failure");这句,运行到这里直接挂掉了。

然后我想了一会儿,尝试性的把1000改成了100,运行成功了,再翻一下书发现Block最多线程是512个,于是我内牛满面的在那里重重的画了几笔,这算是我遇到的第一个CUDA错误吧。果然够脑残,终身难忘。

接下来这个版本是直接对global memory操作的。按照题目的要求,我把blockMax在2和500之间改了N次,但运行时间差别不大,都在62和78这两个值之间徘徊(release版),只是有时blockMax=2时会突然跳到31,我想不通为什么。当然用clock()计时有些粗略,如果只记录kernel调用,得到的值直接就是0了。第二天又看了书,才发觉应该用Event来计时,于是把代码修改了一下,现在能显示比较精确的时间了(cudaEventElapsedTime返回的单位是毫秒)。blockMax取2和500时,差别依然不大,都在0.0134左右。

#include <stdio.h>
#include <time.h>
#include <cutil_inline.h>

// Host-side buffers: the two input vectors and the result copied back from the GPU.
float *h_A, *h_B, *h_C;
// Device-side counterparts, allocated with cudaMalloc in main().
float *d_A, *d_B, *d_C;
// Element-wise vector product: C[k] = A[k] * B[k] for every k in [0, N).
// Each thread handles the single element selected by its flat 1D global index;
// threads whose index is N or larger return without touching memory.
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] * B[idx];
}
// Host driver for the global-memory version.
// Fills two N-element vectors with random values in [0, 1], multiplies them
// element-wise on the GPU with DotMulVet, compares the result with a CPU
// computation, and prints the kernel time measured with CUDA events.
// Returns 0 on normal completion, 1 if host allocation fails.
int main()
{
    int N=1000;
    int i;
    int mem_size=sizeof(float)*N;
    cudaEvent_t   start, finish;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&finish));
    printf("Start build\n");

    h_A=(float*)malloc(mem_size);
    h_B=(float*)malloc(mem_size);
    h_C=(float*)malloc(mem_size);
    if(h_A==NULL || h_B==NULL || h_C==NULL)
    {
        fprintf(stderr,"Host allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }
    for(i=0;i<N;i++)
    {
        // Convert to float BEFORE dividing: rand()/RAND_MAX is an integer
        // division that yields 0 for almost every draw, which would make the
        // correctness check below pass trivially.
        h_A[i]=(float)rand()/(float)RAND_MAX;
        h_B[i]=(float)rand()/(float)RAND_MAX;
    }
    cutilSafeCall(cudaMalloc((void**)&d_A,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_B,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_C,mem_size));
    cutilSafeCall(cudaMemcpy(d_A,h_A,mem_size,cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B,h_B,mem_size,cudaMemcpyHostToDevice));

    printf("Start compute\n");
    // Threads per block. Must not exceed the device's per-block thread limit,
    // otherwise the launch itself fails (reported by cutilCheckMsg below).
    int blockMax=500;
    // Ceiling division so that blockNum*blockMax >= N.
    int blockNum=(N+blockMax-1)/blockMax;
    cutilSafeCall(cudaEventRecord(start,0));
    // This version uses no shared memory, so no third launch argument is given.
    DotMulVet<<< blockNum , blockMax>>>(d_A,d_B,d_C,N);
    cutilCheckMsg("kernel launch failure");
    //cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall(cudaEventRecord(finish,0));
    // Wait for the kernel and the finish event before reading the elapsed time.
    cutilSafeCall(cudaEventSynchronize(finish));
    cutilSafeCall(cudaMemcpy(h_C,d_C,mem_size,cudaMemcpyDeviceToHost));

    // Stop at the first mismatch; i==N afterwards means every element matched.
    for(i=0;i<N;i++)
    {
        float ans=h_A[i]*h_B[i];
        if(fabs(ans-h_C[i])>1E-6)
            break;
    }
    printf("Result: %s\n",(i==N)?"Correct":"Wrong");
    float costTime;
    // Elapsed time between the two events, in milliseconds.
    cutilSafeCall(cudaEventElapsedTime(&costTime,start,finish));
    printf("Cost Time : %f\n",costTime);
    getchar();    // keep the console window open until a key is pressed

    free(h_A);
    free(h_B);
    free(h_C);
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(finish));

    return 0;
}

题目还要求对使用shared memory和global memory做对比,于是我又改了一下DotMulVet函数,这下能看出差别了,但时间反而增加了:blockMax为500时是0.0135,为2时是0.0225。由于在device code中,对A、B、C的访问都只有一次,所以使用shared memory是多此一举的,时间也因此有所增加。每个块的线程数为2的话,需要更多的块,耗费就更大了。(个人推测)

#include <stdio.h>
#include <time.h>
#include <cutil_inline.h>

// Host-side buffers: the two input vectors and the result copied back from the GPU.
float *h_A, *h_B, *h_C;
// Device-side counterparts, allocated with cudaMalloc in main().
float *d_A, *d_B, *d_C;

// Element-wise vector product C[i] = A[i] * B[i] for i in [0, N), with both
// operands staged in dynamic shared memory before being multiplied.
//
// Launch: 1D grid of 1D blocks supplying at least N threads in total. The
// third launch argument must provide at least 2 * min(blockDim.x, N) floats
// of dynamic shared memory per block.
// No __syncthreads() is required: every thread reads back only the two
// shared-memory slots it wrote itself.
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    extern __shared__ float sdata[];    // [0, half): staged A, [half, 2*half): staged B

    int i=blockIdx.x*blockDim.x+threadIdx.x;

    // Threads past the end of the vectors must not read A/B or write C at all;
    // previously only the multiply was guarded, so they accessed out of bounds.
    if(i>=N)
        return;

    // Every thread that gets here has threadIdx.x < min(blockDim.x, N), so this
    // offset keeps the two staging areas disjoint. It fits both a 2*blockDim.x
    // and a 2*N float allocation, so existing launches stay valid.
    int half=((int)blockDim.x<N)?(int)blockDim.x:N;

    sdata[threadIdx.x]=A[i];
    sdata[threadIdx.x+half]=B[i];

    C[i]=sdata[threadIdx.x]*sdata[threadIdx.x+half];
}

// Host driver for the shared-memory version.
// Fills two N-element vectors with random values in [0, 1], multiplies them
// element-wise on the GPU with DotMulVet, compares the result with a CPU
// computation, and prints the kernel time measured with CUDA events.
// Returns 0 on normal completion, 1 if host allocation fails.
int main()
{
    int N=1000;
    int i;
    int mem_size=sizeof(float)*N;
    cudaEvent_t   start, finish;
    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&finish));
    printf("Start build\n");

    h_A=(float*)malloc(mem_size);
    h_B=(float*)malloc(mem_size);
    h_C=(float*)malloc(mem_size);
    if(h_A==NULL || h_B==NULL || h_C==NULL)
    {
        fprintf(stderr,"Host allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    for(i=0;i<N;i++)
    {
        // Convert to float BEFORE dividing: rand()/RAND_MAX is an integer
        // division that yields 0 for almost every draw, which would make the
        // correctness check below pass trivially.
        h_A[i]=(float)rand()/(float)RAND_MAX;
        h_B[i]=(float)rand()/(float)RAND_MAX;
    }

    cutilSafeCall(cudaMalloc((void**)&d_A,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_B,mem_size));
    cutilSafeCall(cudaMalloc((void**)&d_C,mem_size));

    cutilSafeCall(cudaMemcpy(d_A,h_A,mem_size,cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B,h_B,mem_size,cudaMemcpyHostToDevice));

    printf("Start compute\n");

    // Threads per block; deliberately tiny here to compare against larger blocks.
    int blockMax=2;
    // Ceiling division so that blockNum*blockMax >= N.
    int blockNum=(N+blockMax-1)/blockMax;

    cutilSafeCall(cudaEventRecord(start,0));
    // The third launch parameter (Ns) is the number of bytes of dynamic shared
    // memory given to each block: room for one staged copy of A and one of B.
    DotMulVet<<< blockNum , blockMax , 2*mem_size>>>(d_A,d_B,d_C,N);
    cutilCheckMsg("kernel launch failure");

    //cutilSafeCall( cudaThreadSynchronize() );
    cutilSafeCall(cudaEventRecord(finish,0));
    // Wait for the kernel and the finish event before reading the elapsed time.
    cutilSafeCall(cudaEventSynchronize(finish));

    cutilSafeCall(cudaMemcpy(h_C,d_C,mem_size,cudaMemcpyDeviceToHost));

    // Stop at the first mismatch; i==N afterwards means every element matched.
    for(i=0;i<N;i++)
    {
        float ans=h_A[i]*h_B[i];
        if(fabs(ans-h_C[i])>1E-6)
            break;
    }

    printf("Result: %s\n",(i==N)?"Correct":"Wrong");
    float costTime;
    // Elapsed time between the two events, in milliseconds.
    cutilSafeCall(cudaEventElapsedTime(&costTime,start,finish));

    printf("Cost Time : %f\n",costTime);
    getchar();    // keep the console window open until a key is pressed

    free(h_A);
    free(h_B);
    free(h_C);
    cutilSafeCall(cudaFree(d_A));
    cutilSafeCall(cudaFree(d_B));
    cutilSafeCall(cudaFree(d_C));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(finish));

    return 0;
}

本文原创,转载请注明出处:

http://blog.163.com/sara_athena/ 或

http://www.cnblogs.com/luluathena/

posted @ 2010-08-28 15:16  筱夏  阅读(3941)  评论(0)  编辑  收藏  举报