opencl(二十五)----双调排序
参考:《opencl实战》
双调排序
一个序列:进行升序排列
6 1 4 5 7 2 3 8
a、左右两部分别 升序、降序
1 4 5 6 8 7 3 2
b 、左右度应位置比较,小的左移
1 4 3 2 8 7 5 6
c、左右都整成升序
1 2 3 4 5 6 7 8
注:四个元素如何排序
opencl 中可用代码如下:
uint4 mask1 = (uint4)(1, 0, 3, 2);
uint4 mask2 = (uint4)(2, 3, 0, 1);
uint4 mask3 = (uint4)(3, 2, 1, 0);
int4 add1 = (int4)(1, 1, 3, 3);
int4 add2 = (int4)(2, 3, 2, 3);
int4 add3 = (int4)(1, 2, 2, 3);
int4 add4 = (int4)(4, 5, 6, 7);
// float4* : input input1 input2 // dir : 0升 -1降 #define UP 0
#define DOWN -1
/* Sort elements in a vector */
#define SORT_VECTOR(input, dir) \
comp = input < shuffle(input, mask1) ^ dir; \
input = shuffle(input, as_uint4(comp + add1)); \
comp = input < shuffle(input, mask2) ^ dir; \
input = shuffle(input, as_uint4(comp * 2 + add2)); \
comp = input < shuffle(input, mask3) ^ dir; \
input = shuffle(input, as_uint4(comp + add3)); \
/* Sort elements between two vectors */
#define SWAP_VECTORS(input1, input2, dir) \
temp = input1; \
comp = (input1 < input2 ^ dir) * 4 + add4; \
input1 = shuffle2(input1, input2, as_uint4(comp)); \
input2 = shuffle2(input2, temp, as_uint4(comp)); \
demo: 对8个数值排序

1 /**************************** kernel *******************************/ 2 #define UP 0 3 #define DOWN -1 4 5 /* Sort elements in a vector */ 6 #define SORT_VECTOR(input, dir) \ 7 comp = input < shuffle(input, mask1) ^ dir; \ 8 input = shuffle(input, as_uint4(comp + add1)); \ 9 comp = input < shuffle(input, mask2) ^ dir; \ 10 input = shuffle(input, as_uint4(comp * 2 + add2)); \ 11 comp = input < shuffle(input, mask3) ^ dir; \ 12 input = shuffle(input, as_uint4(comp + add3)); \ 13 14 /* Sort elements between two vectors */ 15 #define SWAP_VECTORS(input1, input2, dir) \ 16 temp = input1; \ 17 comp = (input1 < input2 ^ dir) * 4 + add4; \ 18 input1 = shuffle2(input1, input2, as_uint4(comp)); \ 19 input2 = shuffle2(input2, temp, as_uint4(comp)); \ 20 21 __kernel void bsort8(__global float4 *data, int dir) { 22 23 float4 input1, input2, temp; 24 int4 comp; 25 26 uint4 mask1 = (uint4)(1, 0, 3, 2); 27 uint4 mask2 = (uint4)(2, 3, 0, 1); 28 uint4 mask3 = (uint4)(3, 2, 1, 0); 29 30 int4 add1 = (int4)(1, 1, 3, 3); 31 int4 add2 = (int4)(2, 3, 2, 3); 32 int4 add3 = (int4)(1, 2, 2, 3); 33 int4 add4 = (int4)(4, 5, 6, 7); 34 35 input1 = data[0]; 36 input2 = data[1]; 37 38 SORT_VECTOR(input1, UP) 39 SORT_VECTOR(input2, DOWN) 40 41 SWAP_VECTORS(input1, input2, dir) 42 43 SORT_VECTOR(input1, dir) 44 SORT_VECTOR(input2, dir) 45 46 data[0] = input1; 47 data[1] = input2; 48 } 49 50 51 /************************************** 主机程序 ***************************************/ 52 #define _CRT_SECURE_NO_WARNINGS 53 #define PROGRAM_FILE "bsort8.cl" 54 #define KERNEL_FUNC "bsort8" 55 56 #define ASCENDING 0 57 #define DESCENDING -1 58 59 #include <stdio.h> 60 #include <stdlib.h> 61 #include <string.h> 62 #include <time.h> 63 64 #ifdef MAC 65 #include <OpenCL/cl.h> 66 #else 67 #include <CL/cl.h> 68 #endif 69 70 /* Find a GPU or CPU associated with the first available platform */ 71 cl_device_id create_device() { 72 73 cl_platform_id platform; 74 cl_device_id dev; 75 int err; 76 77 /* Identify a platform */ 78 err = clGetPlatformIDs(1, &platform, NULL); 79 if(err < 0) { 80 perror("Couldn't identify a platform"); 81 exit(1); 82 } 83 84 /* Access a device */ 85 err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); 86 if(err == CL_DEVICE_NOT_FOUND) { 87 err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); 88 } 89 if(err < 0) { 90 perror("Couldn't access any devices"); 91 exit(1); 92 } 93 94 return dev; 95 } 96 97 /* Create program from a file and compile it */ 98 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { 99 100 cl_program program; 101 FILE *program_handle; 102 char *program_buffer, *program_log; 103 size_t program_size, log_size; 104 int err; 105 106 /* Read program file and place content into buffer */ 107 program_handle = fopen(filename, "r"); 108 if(program_handle == NULL) { 109 perror("Couldn't find the program file"); 110 exit(1); 111 } 112 fseek(program_handle, 0, SEEK_END); 113 program_size = ftell(program_handle); 114 rewind(program_handle); 115 program_buffer = (char*)malloc(program_size + 1); 116 program_buffer[program_size] = '\0'; 117 fread(program_buffer, sizeof(char), program_size, program_handle); 118 fclose(program_handle); 119 120 /* Create program from file */ 121 program = clCreateProgramWithSource(ctx, 1, 122 (const char**)&program_buffer, &program_size, &err); 123 if(err < 0) { 124 perror("Couldn't create the program"); 125 exit(1); 126 } 127 free(program_buffer); 128 129 /* Build program */ 130 err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 131 if(err < 0) { 132 133 /* Find size of log and print to std output */ 134 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 135 0, NULL, &log_size); 136 program_log = (char*) malloc(log_size + 1); 137 program_log[log_size] = '\0'; 138 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 139 log_size + 1, program_log, NULL); 140 printf("%s\n", program_log); 141 free(program_log); 142 exit(1); 143 } 144 145 return program; 146 } 147 148 int main() { 149 150 /* Host/device data structures */ 151 cl_device_id device; 152 cl_context context; 153 cl_command_queue queue; 154 cl_program program; 155 cl_kernel kernel; 156 cl_int i, err, dir, check; 157 158 /* Data and buffers */ 159 float data[8]; 160 cl_mem data_buffer; 161 162 /* Initialize data */ 163 data[0] = 3.0f; data[1] = 5.0f; data[2] = 4.0f; data[3] = 6.0f; 164 data[4] = 0.0f; data[5] = 7.0f; data[6] = 2.0f; data[7] = 1.0f; 165 printf("Input: %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f\n", 166 data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); 167 168 /* Create a device and context */ 169 device = create_device(); 170 context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); 171 if(err < 0) { 172 perror("Couldn't create a context"); 173 exit(1); 174 } 175 176 /* Create a kernel */ 177 program = build_program(context, device, PROGRAM_FILE); 178 kernel = clCreateKernel(program, KERNEL_FUNC, &err); 179 if(err < 0) { 180 perror("Couldn't create a kernel"); 181 exit(1); 182 }; 183 184 /* Create buffer */ 185 data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | 186 CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err); 187 if(err < 0) { 188 perror("Couldn't create a buffer"); 189 exit(1); 190 }; 191 192 /* Create kernel argument */ 193 err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); 194 if(err < 0) { 195 printf("Couldn't set a kernel argument"); 196 exit(1); 197 }; 198 199 /* Create kernel argument */ 200 dir = ASCENDING; 201 err = clSetKernelArg(kernel, 1, sizeof(int), &dir); 202 if(err < 0) { 203 printf("Couldn't set a kernel argument"); 204 exit(1); 205 }; 206 207 /* Create a command queue */ 208 queue = clCreateCommandQueue(context, device, 0, &err); 209 if(err < 0) { 210 perror("Couldn't create a command queue"); 211 exit(1); 212 }; 213 214 /* Enqueue kernel */ 215 err = clEnqueueTask(queue, kernel, 0, NULL, NULL); 216 if(err < 0) { 217 perror("Couldn't enqueue the kernel"); 218 exit(1); 219 } 220 221 /* Read and print the result */ 222 err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 223 sizeof(data), &data, 0, NULL, NULL); 224 if(err < 0) { 225 perror("Couldn't read the buffer"); 226 exit(1); 227 } 228 printf("Output: %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f\n", 229 data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); 230 231 /* Check the result */ 232 check = 1; 233 234 /* Check ascending sort */ 235 if(dir == ASCENDING) { 236 for(i=1; i<8; i++) { 237 if(data[i] < data[i-1]) { 238 check = 0; 239 break; 240 } 241 } 242 } 243 /* Check descending sort */ 244 if(dir == DESCENDING) { 245 for(i=1; i<8; i++) { 246 if(data[i] > data[i-1]) { 247 check = 0; 248 break; 249 } 250 } 251 } 252 253 /* Display check result */ 254 if(check) 255 printf("Bitonic sort succeeded.\n"); 256 else 257 printf("Bitonic sort failed.\n"); 258 259 /* Deallocate resources */ 260 clReleaseMemObject(data_buffer); 261 clReleaseKernel(kernel); 262 clReleaseCommandQueue(queue); 263 clReleaseProgram(program); 264 clReleaseContext(context); 265 return 0; 266 }
一个通用示例
主机程序:
// 获取设备
// 获取设备 cl_device_id create_device() { cl_platform_id platform; cl_device_id dev; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); } if(err < 0) { perror("Couldn't access any devices"); exit(1); } return dev; }
// 创建并编译cl_program

1 // 创建并编译cl_program 2 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { 3 4 cl_program program; 5 FILE *program_handle; 6 char *program_buffer, *program_log; 7 size_t program_size, log_size; 8 int err; 9 10 /* Read program file and place content into buffer */ 11 program_handle = fopen(filename, "r"); 12 if(program_handle == NULL) { 13 perror("Couldn't find the program file"); 14 exit(1); 15 } 16 fseek(program_handle, 0, SEEK_END); 17 program_size = ftell(program_handle); 18 rewind(program_handle); 19 program_buffer = (char*)malloc(program_size + 1); 20 program_buffer[program_size] = '\0'; 21 fread(program_buffer, sizeof(char), program_size, program_handle); 22 fclose(program_handle); 23 24 /* Create program from file */ 25 program = clCreateProgramWithSource(ctx, 1, 26 (const char**)&program_buffer, &program_size, &err); 27 if(err < 0) { 28 perror("Couldn't create the program"); 29 exit(1); 30 } 31 free(program_buffer); 32 33 /* Build program */ 34 err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 35 if(err < 0) { 36 37 /* Find size of log and print to std output */ 38 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 39 0, NULL, &log_size); 40 program_log = (char*) malloc(log_size + 1); 41 program_log[log_size] = '\0'; 42 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 43 log_size + 1, program_log, NULL); 44 printf("%s\n", program_log); 45 free(program_log); 46 exit(1); 47 } 48 49 return program; 50 }
// main
#禁止不安全的错误警告 #define _CRT_SECURE_NO_WARNINGS #define PROGRAM_FILE "bsort.cl" #define BSORT_INIT "bsort_init" #define BSORT_STAGE_0 "bsort_stage_0" #define BSORT_STAGE_N "bsort_stage_n" #define BSORT_MERGE "bsort_merge" #define BSORT_MERGE_LAST "bsort_merge_last" /* Ascending: 0, Descending: -1 */ #define DIRECTION 0 #define NUM_FLOATS 1048576 #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif /* Find a GPU or CPU associated with the first available platform */ cl_device_id create_device() ; /* Create program from a file and compile it */ cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) ; int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel_init, kernel_stage_0, kernel_stage_n, kernel_merge, kernel_merge_last; cl_int i, err, check, direction; /* Data and buffers */ float data[NUM_FLOATS]; cl_mem data_buffer; cl_uint stage, high_stage, num_stages; size_t local_size, global_size; /* Initialize data */ srand(time(NULL)); for(i=0; i<NUM_FLOATS; i++) { data[i] = rand(); } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program */ program = build_program(context, device, PROGRAM_FILE); /* Create kernels */ kernel_init = clCreateKernel(program, BSORT_INIT, &err); if(err < 0) { perror("Couldn't create the initial kernel"); exit(1); }; kernel_stage_0 = clCreateKernel(program, BSORT_STAGE_0, &err); if(err < 0) { perror("Couldn't create the stage_0 kernel"); exit(1); }; kernel_stage_n = clCreateKernel(program, BSORT_STAGE_N, &err); if(err < 0) { perror("Couldn't create the stage_n kernel"); exit(1); }; kernel_merge = clCreateKernel(program, BSORT_MERGE, &err); if(err < 0) { perror("Couldn't create the merge kernel"); exit(1); }; kernel_merge_last = clCreateKernel(program, BSORT_MERGE_LAST, &err); if(err < 0) { perror("Couldn't create the merge_last kernel"); exit(1); }; /* Determine maximum work-group size */ // 获取工作组中工作项的 数量限制 err = clGetKernelWorkGroupInfo(kernel_init, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); if(err < 0) { perror("Couldn't find the maximum work-group size"); exit(1); }; local_size = (int)pow(2, trunc(log2(local_size))); //函数 TRUNC 直接去除数字的小数部分 /* Create buffer */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel_init, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_stage_0, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_stage_n, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_merge, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_merge_last, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel_init, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_stage_0, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_stage_n, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_merge, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_merge_last, 1, 8*local_size*sizeof(float), NULL); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ // 创建命令队列 queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue initial sorting kernel */ global_size = NUM_FLOATS/8; if(global_size < local_size) { local_size = global_size; } err = clEnqueueNDRangeKernel(queue, kernel_init, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Execute further stages */ num_stages = global_size/local_size; for(high_stage = 2; high_stage < num_stages; high_stage <<= 1) { err = clSetKernelArg(kernel_stage_0, 2, sizeof(int), &high_stage); err |= clSetKernelArg(kernel_stage_n, 3, sizeof(int), &high_stage); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; for(stage = high_stage; stage > 1; stage >>= 1) { err = clSetKernelArg(kernel_stage_n, 2, sizeof(int), &stage); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; err = clEnqueueNDRangeKernel(queue, kernel_stage_n, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } } err = clEnqueueNDRangeKernel(queue, kernel_stage_0, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } } /* Set the sort direction */ direction = DIRECTION; err = clSetKernelArg(kernel_merge, 3, sizeof(int), &direction); err |= clSetKernelArg(kernel_merge_last, 2, sizeof(int), &direction); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Perform the bitonic merge */ for(stage = num_stages; stage > 1; stage >>= 1) { err = clSetKernelArg(kernel_merge, 2, sizeof(int), &stage); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; err = clEnqueueNDRangeKernel(queue, kernel_merge, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } } err = clEnqueueNDRangeKernel(queue, kernel_merge_last, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the result */ err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, sizeof(data), &data, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } check = 1; /* Check ascending sort */ if(direction == 0) { for(i=1; i<NUM_FLOATS; i++) { if(data[i] < data[i-1]) { check = 0; break; } } } /* Check descending sort */ if(direction == -1) { for(i=1; i<NUM_FLOATS; i++) { if(data[i] > data[i-1]) { check = 0; break; } } } /* Display check result */ printf("Local size: %zu\n", local_size); printf("Global size: %zu\n", global_size); if(check) printf("Bitonic sort succeeded.\n"); else printf("Bitonic sort failed.\n"); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel_init); clReleaseKernel(kernel_stage_0); clReleaseKernel(kernel_stage_n); clReleaseKernel(kernel_merge); clReleaseKernel(kernel_merge_last); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
核函数:
/* Sort elements within a vector */ #define VECTOR_SORT(input, dir) \ comp = input < shuffle(input, mask2) ^ dir; \ input = shuffle(input, as_uint4(comp * 2 + add2)); \ comp = input < shuffle(input, mask1) ^ dir; \ input = shuffle(input, as_uint4(comp + add1)); \ #define VECTOR_SWAP(input1, input2, dir) \ temp = input1; \ comp = (input1 < input2 ^ dir) * 4 + add3; \ input1 = shuffle2(input1, input2, as_uint4(comp)); \ input2 = shuffle2(input2, temp, as_uint4(comp)); \ /* Perform initial sort */ __kernel void bsort_init(__global float4 *g_data, __local float4 *l_data) { int dir; uint id, global_start, size, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(1, 2, 2, 3); id = get_local_id(0) * 2; global_start = get_group_id(0) * get_local_size(0) * 2 + id; input1 = g_data[global_start]; input2 = g_data[global_start+1]; /* Sort input 1 - ascending */ comp = input1 < shuffle(input1, mask1); input1 = shuffle(input1, as_uint4(comp + add1)); comp = input1 < shuffle(input1, mask2); input1 = shuffle(input1, as_uint4(comp * 2 + add2)); comp = input1 < shuffle(input1, mask3); input1 = shuffle(input1, as_uint4(comp + add3)); /* Sort input 2 - descending */ comp = input2 > shuffle(input2, mask1); input2 = shuffle(input2, as_uint4(comp + add1)); comp = input2 > shuffle(input2, mask2); input2 = shuffle(input2, as_uint4(comp * 2 + add2)); comp = input2 > shuffle(input2, mask3); input2 = shuffle(input2, as_uint4(comp + add3)); /* Swap corresponding elements of input 1 and 2 */ add3 = (int4)(4, 5, 6, 7); dir = get_local_id(0) % 2 * -1; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); /* Sort data and store in local memory */ VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); l_data[id] = input1; l_data[id+1] = input2; /* Create bitonic set */ for(size = 2; size < get_local_size(0); size <<= 1) { dir = (get_local_id(0)/size & 1) * -1; for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } /* Perform bitonic merge */ dir = (get_group_id(0) % 2) * -1; for(stride = get_local_size(0); stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); g_data[global_start] = input1; g_data[global_start+1] = input2; } /* Perform lowest stage of the bitonic sort */ __kernel void bsort_stage_0(__global float4 *g_data, __local float4 *l_data, uint high_stage) { int dir; uint id, global_start, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(4, 5, 6, 7); /* Determine data location in global memory */ id = get_local_id(0); dir = (get_group_id(0)/high_stage & 1) * -1; global_start = get_group_id(0) * get_local_size(0) * 2 + id; /* Perform initial swap */ input1 = g_data[global_start]; input2 = g_data[global_start + get_local_size(0)]; comp = (input1 < input2 ^ dir) * 4 + add3; l_data[id] = shuffle2(input1, input2, as_uint4(comp)); l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp)); /* Perform bitonic merge */ for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); /* Store output in global memory */ g_data[global_start + get_local_id(0)] = input1; g_data[global_start + get_local_id(0) + 1] = input2; } /* Perform successive stages of the bitonic sort */ __kernel void bsort_stage_n(__global float4 *g_data, __local float4 *l_data, uint stage, uint high_stage) { int dir; float4 input1, input2; int4 comp, add; uint global_start, global_offset; add = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ dir = (get_group_id(0)/high_stage & 1) * -1; global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) * get_local_size(0) + get_local_id(0); global_offset = stage * get_local_size(0); /* Perform swap */ input1 = g_data[global_start]; input2 = g_data[global_start + global_offset]; comp = (input1 < input2 ^ dir) * 4 + add; g_data[global_start] = shuffle2(input1, input2, as_uint4(comp)); g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp)); } /* Sort the bitonic set */ __kernel void bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir) { float4 input1, input2; int4 comp, add; uint global_start, global_offset; add = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) * get_local_size(0) + get_local_id(0); global_offset = stage * get_local_size(0); /* Perform swap */ input1 = g_data[global_start]; input2 = g_data[global_start + global_offset]; comp = (input1 < input2 ^ dir) * 4 + add; g_data[global_start] = shuffle2(input1, input2, as_uint4(comp)); g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp)); } /* Perform final step of the bitonic merge */ __kernel void bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir) { uint id, global_start, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ id = get_local_id(0); global_start = get_group_id(0) * get_local_size(0) * 2 + id; /* Perform initial swap */ input1 = g_data[global_start]; input2 = g_data[global_start + get_local_size(0)]; comp = (input1 < input2 ^ dir) * 4 + add3; l_data[id] = shuffle2(input1, input2, as_uint4(comp)); l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp)); /* Perform bitonic merge */ for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); /* Store the result to global memory */ g_data[global_start + get_local_id(0)] = input1; g_data[global_start + get_local_id(0) + 1] = input2; }
/* Sort elements within a vector */ #define VECTOR_SORT(input, dir) \ comp = input < shuffle(input, mask2) ^ dir; \ input = shuffle(input, as_uint4(comp * 2 + add2)); \ comp = input < shuffle(input, mask1) ^ dir; \ input = shuffle(input, as_uint4(comp + add1)); \ #define VECTOR_SWAP(input1, input2, dir) \ temp = input1; \ comp = (input1 < input2 ^ dir) * 4 + add3; \ input1 = shuffle2(input1, input2, as_uint4(comp)); \ input2 = shuffle2(input2, temp, as_uint4(comp)); \