OpenCL 增强单work-item kernel性能策略
1、基于反馈的Optimization Report解决单个Work-item的Kernel相关性
在许多情况下,将OpenCL™应用程序设计为单个工作项内核就足以在不执行其他优化步骤的情况下最大化性能。
建议采用以下优化单个work-item kernel的选项来按照实用性顺序解决单个work-item kernel循环携带的依赖性:
removal,relaxation,simplification,transfer to local memory。
(1) Removing Loop-Carried Dependency
根据优化报告的反馈,可以通过实现更简单的内存访问的方式来消除循环携带的依赖关系。
优化前:
1 #define N 128 2 3 __kernel void unoptimized (__global int * restrict A, 4 __global int * restrict B, 5 __global int* restrict result) 6 { 7 int sum = 0; 8 9 for (unsigned i = 0; i < N; i++) { 10 for (unsigned j = 0; j < N; j++) { 11 sum += A[i*N+j]; 12 } 13 sum += B[i]; 14 } 15 16 * result = sum; 17 }
优化后:
1 #define N 128 2 3 __kernel void optimized (__global int * restrict A, 4 __global int * restrict B, 5 __global int * restrict result) 6 { 7 int sum = 0; 8 9 for (unsigned i = 0; i < N; i++) { 10 // Step 1: Definition 11 int sum2 = 0; 12 13 // Step 2: Accumulation of array A values for one outer loop iteration 14 for (unsigned j = 0; j < N; j++) { 15 sum2 += A[i*N+j]; 16 } 17 18 // Step 3: Addition of array B value for an outer loop iteration 19 sum += sum2; 20 sum += B[i]; 21 } 22 23 * result = sum; 24 }
(2) Relaxing Loop-Carried Dependency
根据优化报告的反馈,可以通过增加依赖距离的方式来relax缓解循环携带的依赖关系。
优化前:
1 #define N 128 2 3 __kernel void unoptimized (__global float * restrict A, 4 __global float * restrict result) 5 { 6 float mul = 1.0f; 7 8 for (unsigned i = 0; i < N; i++) 9 mul *= A[i]; 10 11 * result = mul; 12 }
优化后:
1 #define N 128 2 #define M 8 3 4 __kernel void optimized (__global float * restrict A, 5 __global float * restrict result) 6 { 7 float mul = 1.0f; 8 9 // Step 1: Declare multiple copies of variable mul 10 float mul_copies[M]; 11 12 // Step 2: Initialize all copies 13 for (unsigned i = 0; i < M; i++) 14 mul_copies[i] = 1.0f; 15 16 for (unsigned i = 0; i < N; i++) { 17 // Step 3: Perform multiplication on the last copy 18 float cur = mul_copies[M-1] * A[i]; 19 20 // Step 4a: Shift copies 21 #pragma unroll 22 for (unsigned j = M-1; j > 0; j--) 23 mul_copies[j] = mul_copies[j-1]; 24 25 // Step 4b: Insert updated copy at the beginning 26 mul_copies[0] = cur; 27 } 28 29 // Step 5: Perform reduction on copies 30 #pragma unroll 31 for (unsigned i = 0; i < M; i++) 32 mul *= mul_copies[i]; 33 34 * result = mul; 35 }
(3) Transferring Loop-Carried Dependency to Local Memory
对于不能消除的循环携带的依赖关系,可以通过将具有循环依赖的数组从全局内存global memory移动到local memory的方式来改进循环的启动间隔(initiation interval, II)。
优化前:
1 #define N 128 2 3 __kernel void unoptimized( __global int* restrict A ) 4 { 5 for (unsigned i = 0; i < N; i++) 6 A[N-i] = A[i]; 7 }
优化后:
1 #define N 128 2 3 __kernel void optimized( __global int* restrict A ) 4 { 5 int B[N]; 6 7 for (unsigned i = 0; i < N; i++) 8 B[i] = A[i]; 9 10 for (unsigned i = 0; i < N; i++) 11 B[N-i] = B[i]; 12 13 for (unsigned i = 0; i < N; i++) 14 A[i] = B[i]; 15 }
(4) Relaxing Loop-Carried Dependency by Inferring Shift Registers
为了使SDK能够有效地处理执行双精度浮点运算的单个work-item kernels,可以通过推断移位寄存器的方式移除循环携带的依赖性。
优化前:
1 __kernel void double_add_1 (__global double *arr, 2 int N, 3 __global double *result) 4 { 5 double temp_sum = 0; 6 7 for (int i = 0; i < N; ++i) 8 { 9 temp_sum += arr[i]; 10 } 11 12 *result = temp_sum; 13 }
优化后:
1 //Shift register size must be statically determinable 2 #define II_CYCLES 12 3 4 __kernel void double_add_2 (__global double *arr, 5 int N, 6 __global double *result) 7 { 8 //Create shift register with II_CYCLE+1 elements 9 double shift_reg[II_CYCLES+1]; 10 11 //Initialize all elements of the register to 0 12 for (int i = 0; i < II_CYCLES + 1; i++) 13 { 14 shift_reg[i] = 0; 15 } 16 17 //Iterate through every element of input array 18 for(int i = 0; i < N; ++i) 19 { 20 //Load ith element into end of shift register 21 //if N > II_CYCLE, add to shift_reg[0] to preserve values 22 shift_reg[II_CYCLES] = shift_reg[0] + arr[i]; 23 24 #pragma unroll 25 //Shift every element of shift register 26 for(int j = 0; j < II_CYCLES; ++j) 27 { 28 shift_reg[j] = shift_reg[j + 1]; 29 } 30 } 31 32 //Sum every element of shift register 33 double temp_sum = 0; 34 35 #pragma unroll 36 for(int i = 0; i < II_CYCLES; ++i) 37 { 38 temp_sum += shift_reg[i]; 39 } 40 41 *result = temp_sum; 42 }
(5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays
在单个work-item的kernel中包含ivdep pragma以保证堆内存array的访问不会导致循环携带的依赖性。
如果对循环中的内存阵列的所有访问都不会导致循环携带的依赖性,在内核代码中的循环之前添加#pragma ivdep。
// no loop-carried dependencies for A and B array accesses #pragma ivdep for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; }
要指定对循环内特定内存数组的访问不会导致循环携带的依赖关系,在内核代码中的循环之前添加#pragma ivdep arrayarray_name)。
ivdep pragma指定的数组必须是local / private内存数组,或者是指向global/local/private内存存储的指针变量。 如果指定的数组是指针,则ivdep pragma也适用于所有可能带有指定指针别名的数组。
ivdep pragma指示指定的数组也可以是struct的数组或指针成员。
// No loop-carried dependencies for A array accesses // The offline compiler will insert hardware that reinforces dependency constraints for B #pragma ivdep array(A) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; } // No loop-carried dependencies for array A inside struct #pragma ivdep array(S.A) for (int i = 0; i < N; i++) { S.A[i] = S.A[i - X[i]]; } // No loop-carried dependencies for array A inside the struct pointed by S #pragma ivdep array(S->X[2][3].A) for (int i = 0; i < N; i++) { S->X[2][3].A[i] = S.A[i - X[i]]; } // No loop-carried dependencies for A and B because ptr aliases // with both arrays int *ptr = select ? A : B; #pragma ivdep array(ptr) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; } // No loop-carried dependencies for A because ptr only aliases with A int *ptr = &A[10]; #pragma ivdep array(ptr) for (int i = 0; i < N; i++) { A[i] = A[i - X[i]]; B[i] = B[i - Y[i]]; }
2、Single work-item kernel的设计技巧
避免指针混淆。使用restrict关键词。
创建格式正确的循环。退出条件与整数进行比较,且每次迭代的增量为1。格式正确的嵌套循环也对最大化kernel性能有帮助。
最小化loop-carried循环携带依赖遵循以下原则:
避免使用指针算数;
声明简单的数组索引;
尽可能在kernel中使用恒定边界的循环。
避免复杂的循环退出条件;
将嵌套循环转换为单循环;
避免条件循环; 避免if else中包含循环,尽量转换为在循环中包含if else。
在尽可能深的范围内声明变量。