爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

计算核函数调用时的占用率,并尝试使用 runtime 函数自动优化线程块尺寸,以便提高占用率。


▶ 源代码。

  1 #include <iostream>
  2 #include "cuda_runtime.h"
  3 #include "device_launch_parameters.h"
  4 #include <helper_cuda.h>         
  5 
  6 const int manualBlockSize = 32;
  7 
  8 // 核函数,输入数组的每个元素平方后放回
  9 __global__ void square(int *array, int arrayCount)
 10 {
 11     extern __shared__ int dynamicSmem[];
 12     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 13     
 14     if (idx < arrayCount)
 15         array[idx] *= array[idx];
 16 }
 17 
 18 // 负责调用核函数,计时,并考虑是否使用 runtime 函数优化线程块尺寸
 19 static int launchConfig(int *data, int size, bool automatic)
 20 {
 21     int blockSize;
 22     int numBlocks;
 23     int gridSize;
 24     int minGridSize;
 25     float elapsedTime;
 26     double potentialOccupancy;
 27     size_t dynamicSMemUsage = 0;
 28     
 29     cudaDeviceProp prop;
 30     cudaGetDeviceProperties(&prop, 0);
 31     cudaEvent_t start;
 32     cudaEvent_t end;
 33     cudaEventCreate(&start);
 34     cudaEventCreate(&end);
 35 
 36     if (automatic)// true 则使用 runtime 函数自动优化线程块尺寸
 37     {
 38         cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
 39         printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
 40     }
 41     else
 42         blockSize = manualBlockSize;
 43 
 44     gridSize = (size + blockSize - 1) / blockSize;
 45 
 46     cudaEventRecord(start);
 47     square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
 48     cudaEventRecord(end);
 49     cudaDeviceSynchronize();
 50     cudaEventElapsedTime(&elapsedTime, start, end);
 51     printf("\n\tElapsed time: %4.2f ms\n", elapsedTime);
 52     
 53     // 依线程数计算占用率,分子分母同除以 prop.warpSize 即按活动线程束数计算,两者等价
 54     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage); 
 55     potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor); 
 56     printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * 100);
 57 
 58     return 0;
 59 }
 60 
 61 // 负责核函数调用前后内存控制,以及结果检查
 62 static int test(bool automaticLaunchConfig, const int count = 1000000)
 63 {
 64     int size = count * sizeof(int);
 65     int *h_data = (int *)malloc(size);
 66     for (int i = 0; i < count; i++)
 67         h_data[i] = i;
 68     int *d_data;
 69     cudaMalloc(&d_data, size);
 70     
 71     cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
 72     memset(h_data,0,size);
 73     launchConfig(d_data, count, automaticLaunchConfig);
 74     cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
 75     
 76     for (int i = 0; i < count; i += 1)
 77     {
 78         if (h_data[i] != i * i)
 79         {
 80             printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
 81             return 1;
 82         }
 83     }             
 84 
 85     free(h_data);
 86     cudaFree(d_data);
 87     return 0;
 88 }
 89 
 90 int main()
 91 {
 92     int status;
 93 
 94     printf("\n\tStart.\n");
 95     
 96     printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
 97     if (test(false))
 98     {
 99         printf("\n\tTest failed\n");
100         return -1;
101     }
102 
103     printf("\n\tAutomatic configuration\n");
104     if (test(true))
105     {
106         printf("\n\tTest failed\n");
107         return -1;
108     }        
109     
110     printf("\n\tTest PASSED\n");
111     getchar();
112     return 0;
113 }

▶ 输出结果

    Start.

    Manual configuration test, BlockSize = 32

    Elapsed time: 0.13 ms

    Potential occupancy: 50.00 %

    Automatic configuration

    Suggested block size: 1024, minimum grid size for maximum occupancy: 32

    Elapsed time: 0.12 ms

    Potential occupancy: 100.00 %

    Test PASSED

 

▶ 涨姿势

● 用到的几个 runtime 函数及其相互关系。

  1 // driver_types.h
  2 // Flags accepted by the block-size optimization functions
  3 #define cudaOccupancyDefault                0x00  // default behavior
  4 #define cudaOccupancyDisableCachingOverride 0x01  // assume global caching is on and cannot be disabled
  5 
  6 // cuda_device_runtime_api.h
  7 // Device-side weak stub sharing its name with the cuda_runtime.h version; (author's note) apparently not used here?
  8 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
  9 {
 10     return cudaErrorUnknown;
 11 }
 12 
 13 // Called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and by cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
 14 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
 15 {
 16     return cudaErrorUnknown;
 17 }
 18 
 19 // cuda_runtime.h — host-side wrapper: forwards to the WithFlags variant using cudaOccupancyDefault
 20 template<class T>
 21 static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
 22 {
 23     return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
 24 }
 25 
 26 template<typename UnaryFunction, class T>
 27 static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
 28 (
 29     int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
 30 )
 31 {    
 32     cudaError_t status;
 33 
 34     // Device and kernel-function attributes
 35     int                       device;
 36     struct cudaFuncAttributes attr;
 37     int maxThreadsPerMultiProcessor;
 38     int warpSize;
 39     int devMaxThreadsPerBlock;
 40     int multiProcessorCount;
 41     int occupancyLimit;
 42     int granularity;
 43 
 44     // Best result found so far
 45     int maxBlockSize = 0;
 46     int numBlocks = 0;
 47     int maxOccupancy = 0;
 48 
 49     // Per-iteration temporaries
 50     int blockSizeToTryAligned;
 51     int blockSizeToTry;
 52     int occupancyInBlocks;
 53     int occupancyInThreads;
 54     size_t dynamicSMemSize;
 55 
 56     // Validate the output pointers and the kernel handle
 57     if (!minGridSize || !blockSize || !func)
 58         return cudaErrorInvalidValue;
 59 
 60     // Query device and kernel-function attributes
 61     status = ::cudaGetDevice(&device);
 62     if (status != cudaSuccess)
 63         return status;
 64     status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
 65     if (status != cudaSuccess)
 66         return status;
 67     status = cudaDeviceGetAttribute(&warpSize,cudaDevAttrWarpSize,device);
 68     if (status != cudaSuccess)
 69         return status;
 70     status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock,cudaDevAttrMaxThreadsPerBlock,device);
 71     if (status != cudaSuccess)
 72         return status;
 73     status = cudaDeviceGetAttribute(&multiProcessorCount,cudaDevAttrMultiProcessorCount,device);
 74     if (status != cudaSuccess)
 75         return status;
 76     status = cudaFuncGetAttributes(&attr, func);
 77     if (status != cudaSuccess)
 78         return status;
 79 
 80     // Sweep candidate block sizes
 81     occupancyLimit = maxThreadsPerMultiProcessor;
 82     granularity = warpSize;
 83 
 84     if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
 85         blockSizeLimit = devMaxThreadsPerBlock;
 86 
 87     if (blockSizeLimit > attr.maxThreadsPerBlock)
 88         blockSizeLimit = attr.maxThreadsPerBlock;
 89 
 90     for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
 91         // Round blockSizeLimit up to a multiple of warpSize, then step downward one warp at a time.
 92         // If blockSizeLimit is below the aligned candidate, try blockSizeLimit itself (only the first iteration is affected)
 93     {        
 94         blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
 95         dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
 96 
 97         // Core of the occupancy computation
 98         status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
 99         if (status != cudaSuccess)
100             return status;
101         
102         // Record the candidate if it improves occupancy (measured in threads)
103         if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
104         {
105             maxBlockSize = blockSizeToTry;
106             numBlocks = occupancyInBlocks;
107             maxOccupancy = occupancyInThreads;
108         }
109 
110         // Occupancy has reached 100% — stop searching
111         if (occupancyLimit == maxOccupancy)
112             break;
113     }
114 
115     // Return the best configuration found
116     *minGridSize = numBlocks * multiProcessorCount;
117     *blockSize = maxBlockSize;
118 
119     return status;
120 }
121 
122 class __cudaOccupancyB2DHelper  // functor mapping any block size to a constant dynamic shared-memory size n
123 {
124     size_t n;
125     public:
126         inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
127         inline __host__ CUDART_DEVICE size_t operator()(int)  // the block-size argument is deliberately ignored
128         {
129             return n; 
130         }
131 };
132 
133 // Runtime function that searches for the block size giving maximum occupancy
134 // Outputs: minimum grid size minGridSize and block size blockSize; inputs: kernel func, dynamic shared-memory size dynamicSMemSize, and block-size upper bound blockSizeLimit (0 = no limit)
135 template<class T>
136 static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
137 (
138     int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
139 )
140 {
141     return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
142 }

 

posted on 2017-11-25 10:30  爨爨爨好  阅读(685)  评论(0编辑  收藏  举报