GPGPU OpenCL Reduction操作与group同步
Reduction操作:规约操作就是由多个数生成一个数,如求最大值、最小值、向量点积、求和等操作,都属于这一类操作。
有大量数据的情况下,使用GPU进行任务并行与数据并行,可以收到可好的效果。
group同步:OpenCL只提供了工作组内的各线程之间的同步机制,并没有提供所有线程的同步。提供组内item-work同步的方法:
void barrier (cl_mem_fence_flags flags)
参数说明:cl_mem_fence_flags 可以取CLK_LOCAL_MEM_FENCE、CLK_GLOBAL_MEM_FENCE
函数说明:(1)一个work-group中所有work-item遇到barrier方法,都要等待其他work-item也到达该语句,才能执行后面的程序;
(2)还可以组内的work-item对local or global memory的顺序读写操作。
如下图中每个大框表示任务并行、每个group线程;框中的计算是数据并行、每个item-work线程:
作为练习,给出个完整的使用OpenCL计算整数序列求和,在数据并行中使用Local Memory 加速,group组内并行同步使用CLK_LOCAL_MEM_FENCE。
程序实例(整数序列求和):
1.核函数(Own_Reduction_Kernels.cl):
1 __kernel 2 void 3 reduce(__global uint4* input, __global uint4* output, int NUM) 4 { 5 NUM = NUM / 4; //每四个数为一个整体uint4。 6 unsigned int tid = get_local_id(0); 7 unsigned int localSize = get_local_size(0); 8 unsigned int globalSize = get_global_size(0); 9 10 uint4 res=(uint4){0,0,0,0}; 11 __local uint4 resArray[64]; 12 13 14 unsigned int i = get_global_id(0); 15 while(i < NUM) 16 { 17 res+=input[i]; 18 i+=globalSize; 19 } 20 resArray[tid]=res; //将每个work-item计算结果保存到对应__local memory中 21 barrier(CLK_LOCAL_MEM_FENCE); 22 23 // do reduction in shared mem 24 for(unsigned int s = localSize >> 1; s > 0; s >>= 1) 25 { 26 if(tid < s) 27 { 28 resArray[tid] += resArray[tid + s]; 29 } 30 barrier(CLK_LOCAL_MEM_FENCE); 31 } 32 33 // write result for this block to global mem 34 if(tid == 0) 35 output[get_group_id(0)] = resArray[0]; 36 }
2.tool.h 、tool.cpp
见:http://www.cnblogs.com/xudong-bupt/p/3582780.html
3.Reduction.cpp
1 #include <CL/cl.h> 2 #include "tool.h" 3 #include <string.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <iostream> 7 #include <string> 8 #include <fstream> 9 using namespace std; 10 11 int isVerify(int NUM,int groupNUM,int *res) //校验结果 12 { 13 int sum1 = (NUM+1)*NUM/2; 14 int sum2 = 0; 15 for(int i = 0;i < groupNUM*4; i++) 16 sum2 += res[i]; 17 if(sum1 == sum2) 18 return 0; 19 return -1; 20 } 21 22 void isStatusOK(cl_int status) //判断状态码 23 { 24 if(status == CL_SUCCESS) 25 cout<<"RIGHT"<<endl; 26 else 27 cout<<"ERROR"<<endl; 28 } 29 30 int main(int argc, char* argv[]) 31 { 32 cl_int status; 33 /**Step 1: Getting platforms and choose an available one(first).*/ 34 cl_platform_id platform; 35 getPlatform(platform); 36 37 /**Step 2:Query the platform and choose the first GPU device if has one.*/ 38 cl_device_id *devices=getCl_device_id(platform); 39 40 /**Step 3: Create context.*/ 41 cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL); 42 43 /**Step 4: Creating command queue associate with the context.*/ 44 cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); 45 46 /**Step 5: Create program object */ 47 const char *filename = "Own_Reduction_Kernels.cl"; 48 string sourceStr; 49 status = convertToString(filename, sourceStr); 50 const char *source = sourceStr.c_str(); 51 size_t sourceSize[] = {strlen(source)}; 52 cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL); 53 54 /**Step 6: Build program. */ 55 status=clBuildProgram(program, 1,devices,NULL,NULL,NULL); 56 57 /**Step 7: Initial input,output for the host and create memory objects for the kernel*/ 58 int NUM=25600; //6400*4 59 size_t global_work_size[1] = {640}; /// 60 size_t local_work_size[1]={64}; ///256 PE 61 size_t groupNUM=global_work_size[0]/local_work_size[0]; 62 int* input = new int[NUM]; 63 for(int i=0;i<NUM;i++) 64 input[i]=i+1; 65 int* output = new int[(global_work_size[0]/local_work_size[0])*4]; 66 67 cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(int),(void *) input, NULL); 68 cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , groupNUM*4* sizeof(int), NULL, NULL); 69 70 /**Step 8: Create kernel object */ 71 cl_kernel kernel = clCreateKernel(program,"reduce", NULL); 72 73 /**Step 9: Sets Kernel arguments.*/ 74 status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); 75 status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer); 76 status = clSetKernelArg(kernel, 2, sizeof(int), &NUM); 77 78 /**Step 10: Running the kernel.*/ 79 cl_event enentPoint; 80 status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, &enentPoint); 81 clWaitForEvents(1,&enentPoint); ///wait 82 clReleaseEvent(enentPoint); 83 isStatusOK(status); 84 85 /**Step 11: Read the cout put back to host memory.*/ 86 status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0,groupNUM*4 * sizeof(int), output, 0, NULL, NULL); 87 isStatusOK(status); 88 if(isVerify(NUM, groupNUM ,output) == 0) 89 cout<<"The result is right!!!"<<endl; 90 else 91 cout<<"The result is wrong!!!"<<endl; 92 93 /**Step 12: Clean the resources.*/ 94 status = clReleaseKernel(kernel);//*Release kernel. 95 status = clReleaseProgram(program); //Release the program object. 96 status = clReleaseMemObject(inputBuffer);//Release mem object. 97 status = clReleaseMemObject(outputBuffer); 98 status = clReleaseCommandQueue(commandQueue);//Release Command queue. 99 status = clReleaseContext(context);//Release context. 100 101 free(input); 102 free(output); 103 free(devices); 104 return 0; 105 }