opencl(二十五)----双调排序

参考:《opencl实战》 

双调排序

一个序列:进行升序排列

6   1   4   5     7    2    3    8

a、左右两部分别 升序、降序

1  4  5   6      8  7  3  2

b 、左右度应位置比较,小的左移

1 4  3   2       8  7   5  6

c、左右都整成升序

1 2 3 4       5  6   7   8

 

注:四个元素如何排序

 

 

 opencl 中可用代码如下:

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(1, 2, 2, 3);
   int4 add4 = (int4)(4, 5, 6, 7);

// float4* : input input1 input2 // dir : 0升 -1降 #define UP 0
#define DOWN -1

/* Sort elements in a vector */
#define SORT_VECTOR(input, dir)                                   \
   comp = input < shuffle(input, mask1) ^ dir;                    \
   input = shuffle(input, as_uint4(comp + add1));                 \
   comp = input < shuffle(input, mask2) ^ dir;                    \
   input = shuffle(input, as_uint4(comp * 2 + add2));             \
   comp = input < shuffle(input, mask3) ^ dir;                    \
   input = shuffle(input, as_uint4(comp + add3));                 \


/* Sort elements between two vectors */
#define SWAP_VECTORS(input1, input2, dir)                         \
   temp = input1;                                                 \
   comp = (input1 < input2 ^ dir) * 4 + add4;                     \
   input1 = shuffle2(input1, input2, as_uint4(comp));             \
   input2 = shuffle2(input2, temp, as_uint4(comp));               \

demo:  对8个数值排序

  1 /****************************  kernel *******************************/
  2 #define UP 0
  3 #define DOWN -1
  4 
  5 /* Sort elements in a vector */
  6 #define SORT_VECTOR(input, dir)                                   \
  7    comp = input < shuffle(input, mask1) ^ dir;                    \
  8    input = shuffle(input, as_uint4(comp + add1));                 \
  9    comp = input < shuffle(input, mask2) ^ dir;                    \
 10    input = shuffle(input, as_uint4(comp * 2 + add2));             \
 11    comp = input < shuffle(input, mask3) ^ dir;                    \
 12    input = shuffle(input, as_uint4(comp + add3));                 \
 13 
 14 /* Sort elements between two vectors */
 15 #define SWAP_VECTORS(input1, input2, dir)                         \
 16    temp = input1;                                                 \
 17    comp = (input1 < input2 ^ dir) * 4 + add4;                     \
 18    input1 = shuffle2(input1, input2, as_uint4(comp));             \
 19    input2 = shuffle2(input2, temp, as_uint4(comp));               \
 20 
 21 __kernel void bsort8(__global float4 *data, int dir) {
 22 
 23    float4 input1, input2, temp;
 24    int4 comp;
 25 
 26    uint4 mask1 = (uint4)(1, 0, 3, 2);
 27    uint4 mask2 = (uint4)(2, 3, 0, 1);
 28    uint4 mask3 = (uint4)(3, 2, 1, 0);
 29 
 30    int4 add1 = (int4)(1, 1, 3, 3);
 31    int4 add2 = (int4)(2, 3, 2, 3);
 32    int4 add3 = (int4)(1, 2, 2, 3);
 33    int4 add4 = (int4)(4, 5, 6, 7);
 34 
 35    input1 = data[0];
 36    input2 = data[1];
 37 
 38    SORT_VECTOR(input1, UP)
 39    SORT_VECTOR(input2, DOWN)
 40 
 41    SWAP_VECTORS(input1, input2, dir)
 42 
 43    SORT_VECTOR(input1, dir)
 44    SORT_VECTOR(input2, dir)
 45 
 46    data[0] = input1;
 47    data[1] = input2;
 48 }
 49 
 50 
 51 /**************************************  主机程序  ***************************************/
 52 #define _CRT_SECURE_NO_WARNINGS
 53 #define PROGRAM_FILE "bsort8.cl"
 54 #define KERNEL_FUNC "bsort8"
 55 
 56 #define ASCENDING 0
 57 #define DESCENDING -1
 58 
 59 #include <stdio.h>
 60 #include <stdlib.h>
 61 #include <string.h>
 62 #include <time.h>
 63 
 64 #ifdef MAC
 65 #include <OpenCL/cl.h>
 66 #else
 67 #include <CL/cl.h>
 68 #endif
 69 
 70 /* Find a GPU or CPU associated with the first available platform */
 71 cl_device_id create_device() {
 72 
 73    cl_platform_id platform;
 74    cl_device_id dev;
 75    int err;
 76 
 77    /* Identify a platform */
 78    err = clGetPlatformIDs(1, &platform, NULL);
 79    if(err < 0) {
 80       perror("Couldn't identify a platform");
 81       exit(1);
 82    } 
 83 
 84    /* Access a device */
 85    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
 86    if(err == CL_DEVICE_NOT_FOUND) {
 87       err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
 88    }
 89    if(err < 0) {
 90       perror("Couldn't access any devices");
 91       exit(1);   
 92    }
 93 
 94    return dev;
 95 }
 96 
 97 /* Create program from a file and compile it */
 98 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
 99 
100    cl_program program;
101    FILE *program_handle;
102    char *program_buffer, *program_log;
103    size_t program_size, log_size;
104    int err;
105 
106    /* Read program file and place content into buffer */
107    program_handle = fopen(filename, "r");
108    if(program_handle == NULL) {
109       perror("Couldn't find the program file");
110       exit(1);
111    }
112    fseek(program_handle, 0, SEEK_END);
113    program_size = ftell(program_handle);
114    rewind(program_handle);
115    program_buffer = (char*)malloc(program_size + 1);
116    program_buffer[program_size] = '\0';
117    fread(program_buffer, sizeof(char), program_size, program_handle);
118    fclose(program_handle);
119 
120    /* Create program from file */
121    program = clCreateProgramWithSource(ctx, 1, 
122       (const char**)&program_buffer, &program_size, &err);
123    if(err < 0) {
124       perror("Couldn't create the program");
125       exit(1);
126    }
127    free(program_buffer);
128 
129    /* Build program */
130    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
131    if(err < 0) {
132 
133       /* Find size of log and print to std output */
134       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
135             0, NULL, &log_size);
136       program_log = (char*) malloc(log_size + 1);
137       program_log[log_size] = '\0';
138       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
139             log_size + 1, program_log, NULL);
140       printf("%s\n", program_log);
141       free(program_log);
142       exit(1);
143    }
144 
145    return program;
146 }
147 
148 int main() {
149 
150    /* Host/device data structures */
151    cl_device_id device;
152    cl_context context;
153    cl_command_queue queue;
154    cl_program program;
155    cl_kernel kernel;
156    cl_int i, err, dir, check;
157 
158    /* Data and buffers */
159    float data[8];
160    cl_mem data_buffer;
161    
162    /* Initialize data */
163    data[0] = 3.0f; data[1] = 5.0f; data[2] = 4.0f; data[3] = 6.0f;
164    data[4] = 0.0f; data[5] = 7.0f; data[6] = 2.0f; data[7] = 1.0f;
165    printf("Input:  %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f\n",
166        data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
167 
168    /* Create a device and context */
169    device = create_device();
170    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
171    if(err < 0) {
172       perror("Couldn't create a context");
173       exit(1);   
174    }
175 
176    /* Create a kernel */
177    program = build_program(context, device, PROGRAM_FILE);
178    kernel = clCreateKernel(program, KERNEL_FUNC, &err);
179    if(err < 0) {
180       perror("Couldn't create a kernel");
181       exit(1);   
182    };
183 
184    /* Create buffer */
185    data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
186          CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err);
187    if(err < 0) {
188       perror("Couldn't create a buffer");
189       exit(1);   
190    };
191 
192    /* Create kernel argument */
193    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
194    if(err < 0) {
195       printf("Couldn't set a kernel argument");
196       exit(1);
197    };
198 
199    /* Create kernel argument */
200    dir = ASCENDING;
201    err = clSetKernelArg(kernel, 1, sizeof(int), &dir);
202    if(err < 0) {
203       printf("Couldn't set a kernel argument");
204       exit(1);
205    };
206 
207    /* Create a command queue */
208    queue = clCreateCommandQueue(context, device, 0, &err);
209    if(err < 0) {
210       perror("Couldn't create a command queue");
211       exit(1);   
212    };
213 
214    /* Enqueue kernel */
215    err = clEnqueueTask(queue, kernel, 0, NULL, NULL); 
216    if(err < 0) {
217       perror("Couldn't enqueue the kernel");
218       exit(1);   
219    }
220 
221    /* Read and print the result */
222    err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
223       sizeof(data), &data, 0, NULL, NULL);
224    if(err < 0) {
225       perror("Couldn't read the buffer");
226       exit(1);   
227    }
228    printf("Output: %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f %3.1f\n",
229        data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
230 
231    /* Check the result */
232    check = 1;
233 
234    /* Check ascending sort */
235    if(dir == ASCENDING) {
236       for(i=1; i<8; i++) {
237          if(data[i] < data[i-1]) {
238             check = 0;
239             break;
240          }
241       }
242    }
243    /* Check descending sort */
244    if(dir == DESCENDING) {
245       for(i=1; i<8; i++) {
246          if(data[i] > data[i-1]) {
247             check = 0;
248             break;
249          }
250       }
251    }
252 
253    /* Display check result */
254    if(check)
255       printf("Bitonic sort succeeded.\n");
256    else
257       printf("Bitonic sort failed.\n");
258 
259    /* Deallocate resources */
260    clReleaseMemObject(data_buffer);
261    clReleaseKernel(kernel);
262    clReleaseCommandQueue(queue);
263    clReleaseProgram(program);
264    clReleaseContext(context);
265    return 0;
266 }
View Code

 

一个通用示例

主机程序:

// 获取设备

// 获取设备
cl_device_id create_device() {

   cl_platform_id platform;
   cl_device_id dev;
   int err;

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn't identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
   if(err == CL_DEVICE_NOT_FOUND) {
      err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
   }
   if(err < 0) {
      perror("Couldn't access any devices");
      exit(1);   
   }

   return dev;
}

// 创建并编译cl_program

 1 // 创建并编译cl_program
 2 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
 3 
 4    cl_program program;
 5    FILE *program_handle;
 6    char *program_buffer, *program_log;
 7    size_t program_size, log_size;
 8    int err;
 9 
10    /* Read program file and place content into buffer */
11    program_handle = fopen(filename, "r");
12    if(program_handle == NULL) {
13       perror("Couldn't find the program file");
14       exit(1);
15    }
16    fseek(program_handle, 0, SEEK_END);
17    program_size = ftell(program_handle);
18    rewind(program_handle);
19    program_buffer = (char*)malloc(program_size + 1);
20    program_buffer[program_size] = '\0';
21    fread(program_buffer, sizeof(char), program_size, program_handle);
22    fclose(program_handle);
23 
24    /* Create program from file */
25    program = clCreateProgramWithSource(ctx, 1, 
26       (const char**)&program_buffer, &program_size, &err);
27    if(err < 0) {
28       perror("Couldn't create the program");
29       exit(1);
30    }
31    free(program_buffer);
32 
33    /* Build program */
34    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
35    if(err < 0) {
36 
37       /* Find size of log and print to std output */
38       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
39             0, NULL, &log_size);
40       program_log = (char*) malloc(log_size + 1);
41       program_log[log_size] = '\0';
42       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
43             log_size + 1, program_log, NULL);
44       printf("%s\n", program_log);
45       free(program_log);
46       exit(1);
47    }
48 
49    return program;
50 }
View Code

// main

#禁止不安全的错误警告
#define _CRT_SECURE_NO_WARNINGS  
#define PROGRAM_FILE       "bsort.cl"
#define BSORT_INIT         "bsort_init"
#define BSORT_STAGE_0      "bsort_stage_0"
#define BSORT_STAGE_N      "bsort_stage_n"
#define BSORT_MERGE        "bsort_merge"
#define BSORT_MERGE_LAST   "bsort_merge_last"

/* Ascending: 0, Descending: -1 */
#define DIRECTION 0
#define NUM_FLOATS 1048576

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>       

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() ;

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) ;
int main() {

   /* Host/device data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel_init, kernel_stage_0, kernel_stage_n, kernel_merge,
         kernel_merge_last;
   cl_int i, err, check, direction;

   /* Data and buffers */
   float data[NUM_FLOATS];
   cl_mem data_buffer;
   cl_uint stage, high_stage, num_stages;
   size_t local_size, global_size;

   /* Initialize data */
   srand(time(NULL));
   for(i=0; i<NUM_FLOATS; i++) {
      data[i] = rand();
   }

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn't create a context");
      exit(1);   
   }

   /* Build the program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create kernels */
   kernel_init = clCreateKernel(program, BSORT_INIT, &err);
   if(err < 0) {
      perror("Couldn't create the initial kernel");
      exit(1);   
   };
   kernel_stage_0 = clCreateKernel(program, BSORT_STAGE_0, &err);
   if(err < 0) {
      perror("Couldn't create the stage_0 kernel");
      exit(1);   
   };
   kernel_stage_n = clCreateKernel(program, BSORT_STAGE_N, &err);
   if(err < 0) {
      perror("Couldn't create the stage_n kernel");
      exit(1);   
   };
   kernel_merge = clCreateKernel(program, BSORT_MERGE, &err);
   if(err < 0) {
      perror("Couldn't create the merge kernel");
      exit(1);   
   };
   kernel_merge_last = clCreateKernel(program, BSORT_MERGE_LAST, &err);
   if(err < 0) {
      perror("Couldn't create the merge_last kernel");
      exit(1);   
   };

   /* Determine maximum work-group size */
   // 获取工作组中工作项的 数量限制
   err = clGetKernelWorkGroupInfo(kernel_init, device, CL_KERNEL_WORK_GROUP_SIZE,
      sizeof(local_size), &local_size, NULL);
   if(err < 0) {
      perror("Couldn't find the maximum work-group size");
      exit(1);   
   };
   local_size = (int)pow(2, trunc(log2(local_size))); //函数 TRUNC 直接去除数字的小数部分


   /* Create buffer */
   data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err);
   if(err < 0) {
      perror("Couldn't create a buffer");
      exit(1);   
   };

   /* Create kernel argument */
   err = clSetKernelArg(kernel_init, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_stage_0, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_stage_n, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_merge, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_merge_last, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);
   };

   /* Create kernel argument */
   err = clSetKernelArg(kernel_init, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_stage_0, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_stage_n, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_merge, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_merge_last, 1, 8*local_size*sizeof(float), NULL);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);
   };

   /* Create a command queue */
   // 创建命令队列
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn't create a command queue");
      exit(1);   
   };

   /* Enqueue initial sorting kernel */
   global_size = NUM_FLOATS/8;
   if(global_size < local_size) {
      local_size = global_size;
   }
   err = clEnqueueNDRangeKernel(queue, kernel_init, 1, NULL, &global_size, 
         &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Execute further stages */
   num_stages = global_size/local_size;
   for(high_stage = 2; high_stage < num_stages; high_stage <<= 1) {

      err = clSetKernelArg(kernel_stage_0, 2, sizeof(int), &high_stage);      
      err |= clSetKernelArg(kernel_stage_n, 3, sizeof(int), &high_stage);
      if(err < 0) {
         printf("Couldn't set a kernel argument");
         exit(1);
      };

      for(stage = high_stage; stage > 1; stage >>= 1) {

         err = clSetKernelArg(kernel_stage_n, 2, sizeof(int), &stage);
         if(err < 0) {
            printf("Couldn't set a kernel argument");
            exit(1);
         };

         err = clEnqueueNDRangeKernel(queue, kernel_stage_n, 1, NULL, 
               &global_size, &local_size, 0, NULL, NULL); 
         if(err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);   
         }
      }

      err = clEnqueueNDRangeKernel(queue, kernel_stage_0, 1, NULL, 
            &global_size, &local_size, 0, NULL, NULL); 
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }
   }

   /* Set the sort direction */
   direction = DIRECTION;
   err = clSetKernelArg(kernel_merge, 3, sizeof(int), &direction);
   err |= clSetKernelArg(kernel_merge_last, 2, sizeof(int), &direction);
   if(err < 0) {
      printf("Couldn't set a kernel argument");
      exit(1);
   };

   /* Perform the bitonic merge */
   for(stage = num_stages; stage > 1; stage >>= 1) {

      err = clSetKernelArg(kernel_merge, 2, sizeof(int), &stage);
      if(err < 0) {
         printf("Couldn't set a kernel argument");
         exit(1);
      };

      err = clEnqueueNDRangeKernel(queue, kernel_merge, 1, NULL, 
            &global_size, &local_size, 0, NULL, NULL); 
      if(err < 0) {
         perror("Couldn't enqueue the kernel");
         exit(1);   
      }
   }
   err = clEnqueueNDRangeKernel(queue, kernel_merge_last, 1, NULL, 
         &global_size, &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn't enqueue the kernel");
      exit(1);   
   }

   /* Read the result */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
      sizeof(data), &data, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn't read the buffer");
      exit(1);   
   }

   check = 1;

   /* Check ascending sort */
   if(direction == 0) {
      for(i=1; i<NUM_FLOATS; i++) {
         if(data[i] < data[i-1]) {
            check = 0;
            break;
         }
      }
   }
   /* Check descending sort */
   if(direction == -1) {
      for(i=1; i<NUM_FLOATS; i++) {
         if(data[i] > data[i-1]) {
            check = 0;
            break;
         }
      }
   }

   /* Display check result */
   printf("Local size: %zu\n", local_size);
   printf("Global size: %zu\n", global_size);
   if(check)
      printf("Bitonic sort succeeded.\n");
   else
      printf("Bitonic sort failed.\n");

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel_init);
   clReleaseKernel(kernel_stage_0);
   clReleaseKernel(kernel_stage_n);
   clReleaseKernel(kernel_merge);
   clReleaseKernel(kernel_merge_last);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}

 

核函数:

/* Sort elements within a vector */
#define VECTOR_SORT(input, dir)                                   \
   comp = input < shuffle(input, mask2) ^ dir;                    \
   input = shuffle(input, as_uint4(comp * 2 + add2));             \
   comp = input < shuffle(input, mask1) ^ dir;                    \
   input = shuffle(input, as_uint4(comp + add1));                 \

#define VECTOR_SWAP(input1, input2, dir)                          \
   temp = input1;                                                 \
   comp = (input1 < input2 ^ dir) * 4 + add3;                     \
   input1 = shuffle2(input1, input2, as_uint4(comp));             \
   input2 = shuffle2(input2, temp, as_uint4(comp));               \

/* Perform initial sort */
__kernel void bsort_init(__global float4 *g_data, __local float4 *l_data) {

   int dir;
   uint id, global_start, size, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(1, 2, 2, 3);

   id = get_local_id(0) * 2;
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   input1 = g_data[global_start]; 
   input2 = g_data[global_start+1];

   /* Sort input 1 - ascending */
   comp = input1 < shuffle(input1, mask1);
   input1 = shuffle(input1, as_uint4(comp + add1));
   comp = input1 < shuffle(input1, mask2);
   input1 = shuffle(input1, as_uint4(comp * 2 + add2));
   comp = input1 < shuffle(input1, mask3);
   input1 = shuffle(input1, as_uint4(comp + add3));

   /* Sort input 2 - descending */
   comp = input2 > shuffle(input2, mask1);
   input2 = shuffle(input2, as_uint4(comp + add1));
   comp = input2 > shuffle(input2, mask2);
   input2 = shuffle(input2, as_uint4(comp * 2 + add2));
   comp = input2 > shuffle(input2, mask3);
   input2 = shuffle(input2, as_uint4(comp + add3));     

   /* Swap corresponding elements of input 1 and 2 */
   add3 = (int4)(4, 5, 6, 7);
   dir = get_local_id(0) % 2 * -1;
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));

   /* Sort data and store in local memory */
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);
   l_data[id] = input1;
   l_data[id+1] = input2;

   /* Create bitonic set */
   for(size = 2; size < get_local_size(0); size <<= 1) {
      dir = (get_local_id(0)/size & 1) * -1;

      for(stride = size; stride > 1; stride >>= 1) {
         barrier(CLK_LOCAL_MEM_FENCE);
         id = get_local_id(0) + (get_local_id(0)/stride)*stride;
         VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
      }

      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) * 2;
      input1 = l_data[id]; input2 = l_data[id+1];
      temp = input1;
      comp = (input1 < input2 ^ dir) * 4 + add3;
      input1 = shuffle2(input1, input2, as_uint4(comp));
      input2 = shuffle2(input2, temp, as_uint4(comp));
      VECTOR_SORT(input1, dir);
      VECTOR_SORT(input2, dir);
      l_data[id] = input1;
      l_data[id+1] = input2;
   }

   /* Perform bitonic merge */
   dir = (get_group_id(0) % 2) * -1;
   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);
   g_data[global_start] = input1;
   g_data[global_start+1] = input2;
}

/* Perform lowest stage of the bitonic sort */
__kernel void bsort_stage_0(__global float4 *g_data, __local float4 *l_data, 
                            uint high_stage) {

   int dir;
   uint id, global_start, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(4, 5, 6, 7);

   /* Determine data location in global memory */
   id = get_local_id(0);
   dir = (get_group_id(0)/high_stage & 1) * -1;
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   /* Perform initial swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + get_local_size(0)];
   comp = (input1 < input2 ^ dir) * 4 + add3;
   l_data[id] = shuffle2(input1, input2, as_uint4(comp));
   l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));

   /* Perform bitonic merge */
   for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);

   /* Store output in global memory */
   g_data[global_start + get_local_id(0)] = input1;
   g_data[global_start + get_local_id(0) + 1] = input2;
}

/* Perform successive stages of the bitonic sort */
__kernel void bsort_stage_n(__global float4 *g_data, __local float4 *l_data, 
                            uint stage, uint high_stage) {

   int dir;
   float4 input1, input2;
   int4 comp, add;
   uint global_start, global_offset;

   add = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   dir = (get_group_id(0)/high_stage & 1) * -1;
   global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                   get_local_size(0) + get_local_id(0);
   global_offset = stage * get_local_size(0);

   /* Perform swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + global_offset];
   comp = (input1 < input2 ^ dir) * 4 + add;
   g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
   g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
}

/* Sort the bitonic set */
__kernel void bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir) {

   float4 input1, input2;
   int4 comp, add;
   uint global_start, global_offset;

   add = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                   get_local_size(0) + get_local_id(0);
   global_offset = stage * get_local_size(0);

   /* Perform swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + global_offset];
   comp = (input1 < input2 ^ dir) * 4 + add;
   g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
   g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
}

/* Perform final step of the bitonic merge */
__kernel void bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir) {

   uint id, global_start, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   id = get_local_id(0);
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   /* Perform initial swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + get_local_size(0)];
   comp = (input1 < input2 ^ dir) * 4 + add3;
   l_data[id] = shuffle2(input1, input2, as_uint4(comp));
   l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));

   /* Perform bitonic merge */
   for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);

   /* Store the result to global memory */
   g_data[global_start + get_local_id(0)] = input1;
   g_data[global_start + get_local_id(0) + 1] = input2;
}
/* Sort elements within a vector */
#define VECTOR_SORT(input, dir)                                   \
   comp = input < shuffle(input, mask2) ^ dir;                    \
   input = shuffle(input, as_uint4(comp * 2 + add2));             \
   comp = input < shuffle(input, mask1) ^ dir;                    \
   input = shuffle(input, as_uint4(comp + add1));                 \

#define VECTOR_SWAP(input1, input2, dir)                          \
   temp = input1;                                                 \
   comp = (input1 < input2 ^ dir) * 4 + add3;                     \
   input1 = shuffle2(input1, input2, as_uint4(comp));             \
   input2 = shuffle2(input2, temp, as_uint4(comp));               \

posted on 2020-01-05 18:38  feihu_h  阅读(979)  评论(0)    收藏  举报

导航