计算核函数调用时的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。
▶ 源代码。
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h>   // provides checkCudaErrors()

// Block size used when the launch configuration is chosen by hand.
const int manualBlockSize = 32;

// Kernel: square every element of `array` in place.
// Expects a 1-D grid of 1-D blocks; each thread handles at most one element.
// The dynamic shared-memory declaration is never read; it is kept so the
// kernel can be paired with a non-zero dynamic smem size in the occupancy
// queries below.
__global__ void square(int *array, int arrayCount)
{
    extern __shared__ int dynamicSmem[];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < arrayCount)
        array[idx] *= array[idx];
}

// Launches the kernel over `size` elements of device buffer `data`, times it,
// and reports the theoretical occupancy of the chosen configuration.
// When `automatic` is true the block size comes from
// cudaOccupancyMaxPotentialBlockSize; otherwise manualBlockSize is used.
// Returns 0 on success.
static int launchConfig(int *data, int size, bool automatic)
{
    int blockSize;
    int numBlocks;
    int gridSize;
    int minGridSize;
    float elapsedTime;
    double potentialOccupancy;
    size_t dynamicSMemUsage = 0;   // kernel uses no dynamic shared memory

    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, 0));

    cudaEvent_t start;
    cudaEvent_t end;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&end));

    if (automatic)  // let the runtime suggest a block size for maximum occupancy
    {
        checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
            &minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size));
        printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n",
               blockSize, minGridSize);
    }
    else
    {
        blockSize = manualBlockSize;
    }

    gridSize = (size + blockSize - 1) / blockSize;  // ceil-div so every element is covered

    checkCudaErrors(cudaEventRecord(start));
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
    checkCudaErrors(cudaGetLastError());        // surface launch-configuration errors
    checkCudaErrors(cudaEventRecord(end));
    checkCudaErrors(cudaEventSynchronize(end)); // wait for `end` before reading the timer
    checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
    printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);

    // Occupancy is computed in threads; dividing numerator and denominator by
    // prop.warpSize (i.e. counting active warps instead) gives the same ratio.
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, square, blockSize, dynamicSMemUsage));
    potentialOccupancy = (double)(numBlocks * blockSize) / prop.maxThreadsPerMultiProcessor;
    printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);

    // Original version leaked both events on every call.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(end));

    return 0;
}

// Allocates and initializes host/device buffers, runs launchConfig, and
// verifies every element was squared.  Returns 0 on success, 1 on failure.
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    size_t size = count * sizeof(int);
    int *h_data = (int *)malloc(size);
    if (h_data == NULL)   // original never checked the allocation
    {
        printf("\n\tHost allocation failed\n");
        return 1;
    }
    for (int i = 0; i < count; i++)
        h_data[i] = i;

    int *d_data;
    checkCudaErrors(cudaMalloc(&d_data, size));
    checkCudaErrors(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    memset(h_data, 0, size);  // clear the host copy so the check cannot see stale input

    launchConfig(d_data, count, automaticLaunchConfig);
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));

    // NOTE: i * i overflows int for large i; host and device wrap the same
    // way in practice, so the comparison stays consistent with the kernel.
    int status = 0;
    for (int i = 0; i < count; i += 1)
    {
        if (h_data[i] != i * i)
        {
            printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
            status = 1;   // original returned here, leaking h_data and d_data
            break;
        }
    }

    free(h_data);
    checkCudaErrors(cudaFree(d_data));
    return status;
}

// Runs the manual-block-size test, then the runtime-optimized one.
int main()
{
    printf("\n\tStart.\n");

    printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
    if (test(false))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tAutomatic configuration\n");
    if (test(true))
    {
        printf("\n\tTest failed\n");
        return -1;
    }

    printf("\n\tTest PASSED\n");
    getchar();
    return 0;
}
▶ 输出结果
Start. Manual configuration test, BlockSize = 32 Elapsed time: 0.13 ms Potential occupancy: 50.00 % Automatic configuration Suggested block size: 1024, minimum grid size for maximum occupancy: 32 Elapsed time: 0.12 ms Potential occupancy: 100.00 % Test PASSED
▶ 涨姿势
● 用到的几个 runtime 函数及其相互关系。
// driver_types.h
// Flags accepted by the occupancy / block-size-optimization functions.
#define cudaOccupancyDefault 0x00                 // default behavior
#define cudaOccupancyDisableCachingOverride 0x01  // global caching is on and cannot be disabled

// cuda_device_runtime_api.h
// Same-named function as the one in cuda_runtime.h; apparently unused here?
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
    return cudaErrorUnknown;
}

// Called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and by
// cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
    return cudaErrorUnknown;
}

// cuda_runtime.h
// Host-side wrapper: forwards to the WithFlags variant using the default flag.
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
}

// Searches over candidate block sizes (warp-size granularity) for the one
// maximizing theoretical occupancy; dynamic shared memory per block is given
// as a function of block size via `blockSizeToDynamicSMemSize`.
template<typename UnaryFunction, class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
    int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int occupancyLimit;
    int granularity;

    // Best result seen so far
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporaries for the search loop
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    // Validate inputs
    if (!minGridSize || !blockSize || !func)
        return cudaErrorInvalidValue;

    // Query device and kernel attributes
    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&warpSize,cudaDevAttrWarpSize,device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock,cudaDevAttrMaxThreadsPerBlock,device);
    if (status != cudaSuccess)
        return status;
    status = cudaDeviceGetAttribute(&multiProcessorCount,cudaDevAttrMultiProcessorCount,device);
    if (status != cudaSuccess)
        return status;
    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess)
        return status;

    // Try block sizes
    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
        blockSizeLimit = devMaxThreadsPerBlock;

    if (blockSizeLimit > attr.maxThreadsPerBlock)
        blockSizeLimit = attr.maxThreadsPerBlock;

    for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
    // blockSizeLimit is rounded up to a multiple of warpSize, then candidates
    // are tried decreasing in warpSize steps.
    // If blockSizeLimit is already below the aligned start, the first (and
    // only first) iteration tries blockSizeLimit itself.
    {
        blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        // Core of the occupancy computation
        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
        if (status != cudaSuccess)
            return status;

        // Record the best candidate
        if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // 100% occupancy reached — stop searching
        if (occupancyLimit == maxOccupancy)
            break;
    }

    // Return the best configuration found
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;

    return status;
}

// Adapter that turns a fixed dynamic-smem byte count into the unary
// "block size -> smem size" function the search above expects.
class __cudaOccupancyB2DHelper
{
    size_t n;
public:
    inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
    inline __host__ CUDART_DEVICE size_t operator()(int)
    {
        return n;
    }
};

// The runtime function that optimizes the block size.
// Parameters: out minimum grid size minGridSize, out block size blockSize,
// kernel func, dynamic shared memory size dynamicSMemSize,
// total thread count limit blockSizeLimit.
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
    int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}