


  •         指针遍历

OpenCL不支持CUDA那样的指针遍历方式, 你只能用下标方式间接实现指针遍历. 例子代码如下:

// CUDA
struct Node { Node* next; }
n = n->next;

 // OpenCL

struct Node { unsigned int next; }

n = bufBase + n;

  • Kernel 程序异同


OpenCL的kernel使用伪代码,程序运行时即时编译和装载。这个类似Java、.NET程序,道理也一样,都是为了支持跨平台的兼容。kernel程序的语法也略有不同,对比如下:


  1. __global__ void vectorAdd(const float * a, const float * b, float * c)  
  2. { // CUDA  
  3.     int nIndex = blockIdx.x * blockDim.x + threadIdx.x;  
  4.     c[nIndex] = a[nIndex] + b[nIndex];  
  5. }  


  1. __kernel void vectorAdd(__global const float * a, __global const float * b, __global float * c)  
  2. { // OpenCL  
  3.     int nIndex = get_global_id(0);  
  4.     c[nIndex] = a[nIndex] + b[nIndex];  
  5. }  






1)CUDA 的kernel函数使用“__global__”声明,而OpenCL的kernel函数使用“__kernel”作为声明。

2)CUDA 的指针参数不需要额外修饰,而OpenCL中指向设备内存的指针参数需要加“__global”修饰符。

3)众所周知,CUDA采用threadIdx.{x|y|z}, blockIdx.{x|y|z}来获得当前线程的索引号,而OpenCL则通过get_global_id()函数直接获得线程的全局索引号。



  • Host代码的异同



  1. const unsigned int cnBlockSize = 512;  
  2. const unsigned int cnBlocks = 3;  
  3. const unsigned int cnDimension = cnBlocks * cnBlockSize;  
  4. CUdevice hDevice;  
  5. CUcontext hContext;  
  6. CUmodule hModule;  
  7. CUfunction hFunction;  
  8. // create CUDA device & context  
  9. cuInit(0);  
  10. cuDeviceGet(&hDevice, 0); // pick first device  
  11. cuCtxCreate(&hContext, 0, hDevice);  
  12. cuModuleLoad(&hModule, "vectorAdd.cubin");  
  13. cuModuleGetFunction(&hFunction, hModule, "vectorAdd");  
  14. // allocate host vectors  
  15. float * pA = new float[cnDimension];  
  16. float * pB = new float[cnDimension];  
  17. float * pC = new float[cnDimension];  
  18. // initialize host memory  
  19. randomInit(pA, cnDimension);  
  20. randomInit(pB, cnDimension);  
  21. // allocate memory on the device  
  22. CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;  
  23. cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));  
  24. cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));  
  25. cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));  
  26. // copy host vectors to device  
  27. cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));  
  28. cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));  
  29. // setup parameter values  
  30. cuFuncSetBlockShape(hFunction, cnBlockSize, 1, 1);  
  31. cuParamSeti(hFunction, 0, pDeviceMemA);  
  32. cuParamSeti(hFunction, 4, pDeviceMemB);  
  33. cuParamSeti(hFunction, 8, pDeviceMemC);  
  34. cuParamSetSize(hFunction, 12);  
  35. // execute kernel  
  36. cuLaunchGrid(hFunction, cnBlocks, 1);  
  37. // copy the result from device back to host  
  38. cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float));  
  39. delete[] pA;  
  40. delete[] pB;  
  41. delete[] pC;  
  42. cuMemFree(pDeviceMemA);  
  43. cuMemFree(pDeviceMemB);  
  44. cuMemFree(pDeviceMemC);  




OpenCL的代码以文本方式存放在“sProgramSource”。 调用方式如下:


  1. const unsigned int cnBlockSize = 512;  
  2. const unsigned int cnBlocks = 3;  
  3. const unsigned int cnDimension = cnBlocks * cnBlockSize;  
  4. // create OpenCL device & context  
  5. cl_context hContext;  
  6. hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);  
  7. // query all devices available to the context  
  8. size_t nContextDescriptorSize;  
  9. clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);  
  10. cl_device_id * aDevices = malloc(nContextDescriptorSize);  
  11. clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);  
  12. // create a command queue for first device the context reported  
  13. cl_command_queue hCmdQueue;  
  14. hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);  
  15. // create & compile program  
  16. cl_program hProgram;  
  17. hProgram = clCreateProgramWithSource(hContext, 1, sProgramSource, 0, 0);  
  18. clBuildProgram(hProgram, 0, 0, 0, 0, 0);// create kernel  
  19. cl_kernel hKernel;  
  20. hKernel = clCreateKernel(hProgram, "vectorAdd", 0);  
  21. // allocate host vectors  
  22. float * pA = new float[cnDimension];  
  23. float * pB = new float[cnDimension];  
  24. float * pC = new float[cnDimension];  
  25. // initialize host memory  
  26. randomInit(pA, cnDimension);  
  27. randomInit(pB, cnDimension);  
  28. // allocate device memory  
  29. cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;  
  30. hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);  
  31. hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);  
  32. hDeviceMemC = clCreateBuffer(hContext,  
  33. CL_MEM_WRITE_ONLY,  
  34. cnDimension * sizeof(cl_float), 0, 0);  
  35. // setup parameter values  
  36. clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);  
  37. clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);  
  38. clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);  
  39. // execute kernel  
  40. clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0, &cnDimension, 0, 0, 0, 0);  
  41. // copy results from device back to host  
  42. clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),  
  43. pC, 0, 0, 0);  
  44. delete[] pA;  
  45. delete[] pB;  
  46. delete[] pC;  
  47. clReleaseMemObject(hDeviceMemA);  
  48. clReleaseMemObject(hDeviceMemB);  
  49. clReleaseMemObject(hDeviceMemC);  




  • 初始化部分的异同 

CUDA 在使用任何API之前必须调用cuInit(0),然后是获得当前系统的可用设备并获得Context。
cuDeviceGet(&hDevice, 0);
cuCtxCreate(&hContext, 0, hDevice);
// OpenCL
cl_context hContext;
hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);
size_t nContextDescriptorSize;
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);
cl_device_id * aDevices = malloc(nContextDescriptorSize);

clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);

OpenCL introduces an additional concept: Command Queues. Commands launching kernels and
reading or writing memory are always issued for a specific command queue. A command queue is
created on a specific device in a context. The following code creates a command queue for the
device and context created so far:
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
With this the program has progressed to the point where data can be uploaded to the device’s
memory and processed by launching compute kernels on the device.

  • Kernel Creation

CUDA kernel 以二进制格式存放于CUBIN文件中,其调用格式和DLL的用法比较类似,先装载二进制库,然后通过函数名查找对应的函数句柄。

CUmodule hModule;
cuModuleLoad(&hModule, "vectorAdd.cubin");
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
OpenCL 为了支持多平台,所以不使用编译后的代码,采用类似JAVA的方式,装载文本格式的代码文件,然后即时编译并运行。

// 装载源代码文本(sProgramSource),即时编译
cl_program hProgram;
hProgram = clCreateProgramWithSource(hContext, 1, sProgramSource, 0, 0);
clBuildProgram(hProgram, 0, 0, 0, 0, 0);
// 获得kernel函数句柄
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, "vectorAdd", 0);


  • 设备内存分配

内存分配没有什么大区别。OpenCL提供两个特殊的标志,CL_MEM_READ_ONLY 和 CL_MEM_WRITE_ONLY,用来控制内存的读写权限。

另外一个标志比较有用:CL_MEM_COPY_HOST_PTR 表示用主机内存(host_ptr)中的数据初始化新分配的设备内存,运行时会自动完成拷贝,无需再显式调用拷贝函数。


// CUDA
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// OpenCL
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);

  • Kernel Parameter Specification

The next step in preparing the kernels for launch is to establish a mapping between the kernels’
parameters, essentially pointers to the three vectors A, B and C, and the three device memory regions
which were allocated in the previous section.
Parameter setting in both APIs is a pretty low-level affair. It requires knowledge of the total number,
order, and types of a given kernel’s parameters. The order and types of the parameters are used to
determine a specific parameter’s offset inside the data block made up of all parameters. The offset in
bytes for the n-th parameter is essentially the sum of the sizes of all (n-1) preceding parameters.
Using the CUDA Driver API:
In CUDA device pointers are represented as unsigned int and the CUDA Driver API has a
dedicated method for setting that type. Here’s the code for setting the three parameters. Note how
the offset is incrementally computed as the sum of the previous parameters’ sizes.
cuParamSeti(hFunction, 0, pDeviceMemA);
cuParamSeti(hFunction, 4, pDeviceMemB);
cuParamSeti(hFunction, 8, pDeviceMemC);
cuParamSetSize(hFunction, 12);
Using OpenCL:
In OpenCL parameter setting is done via a single function that takes a pointer to the location of the
parameter to be set.
clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);

