slean

 

OpenCL 并行求和

上周尝试用 OpenCL 求极大值,在网上查到的资料大多是求和,也就是所谓的 reduction(归约)算法,不过两者的思路是一样的。

CPP:

   int err = 0;
    unsigned long int nNumCount = 102400000;
    int nLocalSize = 256;
    int nGroupSize = 102400;
    int nGroup = nGroupSize / nLocalSize;

    int* pArray = new int[nNumCount];
    unsigned long int nReal = 0;
    int nStart = GetTickCount();
    for (int i=0;i<nNumCount;++i)
    {
        pArray[i] = i*2;
        nReal += pArray[i];
    }
    cout<<GetTickCount() - nStart<<endl;

    cl_mem clmemArray = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nNumCount,NULL,NULL);
    err = clEnqueueWriteBuffer(queue,clmemArray,CL_TRUE,0,sizeof(int)*nNumCount,pArray,0,0,0);
    cl_mem clmemRes  = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nGroup,NULL,NULL);
    
    nStart = GetTickCount();

    err = clSetKernelArg(m_KerCalcRay,0,sizeof(cl_mem),&clmemArray);
    err = clSetKernelArg(m_KerCalcRay,1,sizeof(cl_mem),&clmemRes);
    err = clSetKernelArg(m_KerCalcRay,2,sizeof(int)*nLocalSize,0);
    err = clSetKernelArg(m_KerCalcRay,3,sizeof(int),&nNumCount);

    
    size_t localws[1] = {nLocalSize};
    size_t globalws[1] = {nGroupSize};

    err = clEnqueueNDRangeKernel(queue,m_KerCalcRay,1,NULL,globalws,localws,0,NULL,NULL);
    clFinish(queue);

    int* pRes = new int[nGroup];
    err = clEnqueueReadBuffer(queue,clmemRes,CL_TRUE,0,sizeof(int)*nGroup,pRes,0,0,0);
    clFinish(queue);

    unsigned long int nRes = 0;
    for(int i=0;i<nGroup;++i)
    {
        nRes += pRes[i];
    }
  assert(nRes == nReal);

kernel:

/*
 * ReduceSum - compute one partial sum of an int array per work-group.
 *
 * num    : input values; elements [0, nCount) are read.
 * res    : output; the work-group with id b stores its partial sum in res[b].
 * pData  : local scratch buffer indexed by local id, so it must hold at
 *          least get_local_size(0) ints.
 * nCount : number of elements of num to sum; values <= 0 mean "no input".
 *
 * Each work-item first adds up elements gid, gid + globalSize, ... into a
 * private accumulator, then the work-group combines those accumulators in
 * pData. All additions are plain int additions with no overflow protection.
 */
__kernel void ReduceSum(__global int* num,__global int* res,__local int* pData,int nCount)
{
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);
    unsigned int globalSize = get_global_size(0);

    // Clamp explicitly: comparing the unsigned index against a negative int
    // would convert the bound to a huge unsigned value and read out of range.
    const unsigned int count = (nCount > 0) ? (unsigned int)nCount : 0u;

    // Private accumulation over this work-item's strided share of the input.
    int nRes = 0;
    while(gid < count)
    {
        nRes += num[gid];
        gid += globalSize;
    }
    pData[tid] = nRes;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduction in local memory. `active` is the number of live slots; each
    // round folds the upper part onto the lower part. Rounding the stride up
    // keeps the middle element when `active` is odd, so the result is also
    // correct for work-group sizes that are not a power of two. For
    // power-of-two sizes the pairing is the same as plain halving.
    // The loop bounds depend only on localSize, so every work-item reaches
    // every barrier.
    for(unsigned int active = localSize; active > 1; )
    {
        const unsigned int stride = (active + 1) >> 1;
        if(tid + stride < active)
        {
            pData[tid] += pData[tid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        active = stride;
    }

    // Slot 0 now holds the sum for the whole work-group.
    if(tid == 0)
        res[bid] = pData[0];

}

Reduction 求和是这样一种方法:比如 8 个数依次存放在下标 0 到 7 中,第一轮将下标 0 和 4、1 和 5、2 和 6、3 和 7 分别相加,结果放到下标 0、1、2、3 中(然后用 barrier(CLK_LOCAL_MEM_FENCE) 同步一次);第二轮将下标 0 和 2、1 和 3 相加,结果放到下标 0、1 中;如此往复,最终结果就放到下标 0 中。

另:我试过循环展开减少同步次数、不过效率增长微乎其微。

posted on 2014-06-05 12:38  slean  阅读(1156)  评论(0)  编辑  收藏  举报

导航