Use CUDA's overlap capability to run kernel computation and host↔device data copies concurrently, hiding transfer time behind compute to speed up the whole pipeline.
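Before the full listing, a minimal sketch of the overlap idiom it relies on (the kernel addOne, the buffer names, and the two-stream setup here are illustrative, not taken from the sample below): page-locked host memory from cudaHostAlloc, cudaMemcpyAsync, and kernel launches issued into non-default streams, so that a copy queued in one stream can overlap with the kernel running in another.

#include <cuda_runtime.h>
#include <stdio.h>
#include <string.h>

// Illustrative kernel: adds 1 to every element.
__global__ void addOne(int *out, const int *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i] + 1;
}

int main()
{
    const int n = 1 << 20;
    const int bytes = n * (int)sizeof(int);
    int *h_buf[2];
    int *d_in[2], *d_out[2];
    cudaStream_t s[2];

    for (int i = 0; i < 2; ++i)
    {
        // Pinned (page-locked) host memory is required for truly asynchronous copies.
        cudaHostAlloc(&h_buf[i], bytes, cudaHostAllocDefault);
        memset(h_buf[i], 0, bytes);
        cudaMalloc(&d_in[i], bytes);
        cudaMalloc(&d_out[i], bytes);
        cudaStreamCreate(&s[i]);
    }

    // Work issued to different (non-default) streams may overlap: the copy engine
    // can move data for one stream while the SMs run the kernel of another.
    for (int i = 0; i < 2; ++i)
    {
        cudaMemcpyAsync(d_in[i], h_buf[i], bytes, cudaMemcpyHostToDevice, s[i]);
        addOne<<<(n + 511) / 512, 512, 0, s[i]>>>(d_out[i], d_in[i], n);
        cudaMemcpyAsync(h_buf[i], d_out[i], bytes, cudaMemcpyDeviceToHost, s[i]);
    }
    cudaDeviceSynchronize();

    printf("h_buf[0][0] = %d, h_buf[1][0] = %d\n", h_buf[0][0], h_buf[1][0]);  // expect 1, 1

    for (int i = 0; i < 2; ++i)
    {
        cudaFreeHost(h_buf[i]);
        cudaFree(d_in[i]);
        cudaFree(d_out[i]);
        cudaStreamDestroy(s[i]);
    }
    return 0;
}

On a device with a single copy engine only one transfer direction can overlap with the kernel; with two copy engines (asyncEngineCount > 1) upload, compute, and download can all proceed at once, which is exactly what the capability printout in the sample below checks.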
▶ Source code. Using 4 streams, the program runs the "upload - kernel - download" cycle 10 times in total and records the elapsed time, once serialized in a single stream and once spread across all 4 streams.
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define STREAM_COUNT 4
#define NREPS 10
#define INNER_REPS 5

int N = 1 << 22;
int memsize;

int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

dim3 grid;
dim3 block(512);
cudaEvent_t cycleDone[STREAM_COUNT], start, stop;
cudaStream_t stream[STREAM_COUNT];

__global__ void incKernel(int *g_out, int *g_in, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
    {
        for (int i = 0; i < INNER_REPS; ++i)  // repeat 5 times just to lengthen the kernel; whether the compiler optimizes the redundant stores away is an open question
            g_out[idx] = g_in[idx] + 1;
    }
}

float processWithStreams(int streams_used)
{
    cudaEventRecord(start, 0);
    for (int i = 0, current_stream = 0; i < NREPS; ++i)
    {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO  // only compiled when SIMULATE_IO is defined
        // consume the downloaded data
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // refresh the data to be uploaded
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // make sure the work queued on stream next_stream in the previous cycle has completed
        cudaEventSynchronize(cycleDone[next_stream]);

        // launch the kernel in the current stream
        incKernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream], d_data_in[current_stream], N);

        // upload data for the next stream
        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]);

        // download results from the current stream
        cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream], memsize, cudaMemcpyDeviceToHost, stream[current_stream]);

        cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);
    cudaDeviceSynchronize();

    float time;
    cudaEventElapsedTime(&time, start, stop);
    return time;
}

bool test()
{
    bool passed = true;
    for (int j = 0; j < STREAM_COUNT; ++j)
    {
        for (int i = 0; i < N; ++i)
            passed &= (h_data_out[j][i] == 1);
    }
    return passed;
}

int main(int argc, char *argv[])
{
    printf("\n\tStart.\n");

    // select a device and query its properties
    int cuda_device = 0;
    cudaDeviceProp deviceProp;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        if ((cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=")) < 0)
        {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else
        {
            printf("cuda_device = %d\n", cuda_device);
            if ((cuda_device = gpuDeviceInit(cuda_device)) < 0)
            {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else
    {
        cuda_device = gpuGetMaxGflopsDeviceId();
        cudaSetDevice(cuda_device);
        cudaGetDeviceProperties(&deviceProp, cuda_device);
        printf("\n\tDevice [%d]: %s, compute capability %d.%d, ", cuda_device, deviceProp.name, deviceProp.major, deviceProp.minor);
    }
    cudaGetDeviceProperties(&deviceProp, cuda_device);
    printf("%d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),  // maps compute capability to cores per SM, defined in helper_cuda.h
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("\n\tRelevant properties of this CUDA device.\n");
    printf("\t(%c) Execute several GPU kernels simultaneously\n", deviceProp.major >= 2 ? 'Y' : 'N');
    printf("\t(%c) Overlap one CPU<->GPU data transfer with GPU kernel execution\n", deviceProp.deviceOverlap ? 'Y' : 'N');
    printf("\t(%c) Overlap two CPU<->GPU data transfers with GPU kernel execution\n", (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? 'Y' : 'N');

    // scale down the workload if the device has fewer than 32 CUDA cores
    float scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
    N = (int)((float)N / scale_factor);
    printf("\n\tscale_factor = %.2f\n\tarray_size = %d\n", 1.0f / scale_factor, N);

    // set up the launch configuration
    memsize = N * sizeof(int);
    int thread_blocks = N / block.x;
    grid.x = thread_blocks % 65535;
    grid.y = (thread_blocks / 65535 + 1);

    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);
    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
        cudaMalloc(&d_data_in[i], memsize);
        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault);
        cudaMalloc(&d_data_out[i], memsize);

        cudaStreamCreate(&stream[i]);
        cudaEventCreate(&cycleDone[i]);
        cudaEventRecord(cycleDone[i], stream[i]);
    }
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // initialize h_data_source and h_data_in
    for (int i = 0; i < N; ++i)
        h_data_source[i] = 0;
    for (int i = 0; i < STREAM_COUNT; ++i)
        memcpy(h_data_in[i], h_data_source, memsize);

    // warm-up run
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N);

    // time each individual operation
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n\tMeasured timings (throughput):\n");
    printf("\tMemcpy host to device:\t%f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf("\tMemcpy device to host:\t%f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf("\tKernel: \t%f ms (%f GB/s)\n", kernel_time, (INNER_REPS * memsize * 2e-6) / kernel_time);

    printf("\n\tTheoretical limits for speedup gained from overlapped data transfers:\n");
    printf("\tNo overlap (transfer-kernel-transfer):\t%f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("\tOverlap one transfer: \t%f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("\tOverlap both data transfers: \t%f ms\n", max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

    // run the workload serialized and with overlap
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\n\tAverage measured timings over %d repetitions:\n", NREPS);
    printf("\tAvg. time serialized: \t%f ms (%f GB/s)\n", serial_time / NREPS, (NREPS * (memsize * 2e-6)) / serial_time);
    printf("\tAvg. time using %d streams:\t%f ms (%f GB/s)\n", STREAM_COUNT, overlap_time / NREPS, (NREPS * (memsize * 2e-6)) / overlap_time);
    printf("\tAvg. speedup gained: \t%f ms\n", (serial_time - overlap_time) / NREPS);

    printf("\n\tResult test: %s.\n", test() ? "Passed" : "Failed");

    // cleanup
    free(h_data_source);
    free(h_data_sink);
    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);
        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    getchar();
    return 0;
}
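The listing keeps error handling minimal. A hedged sketch of how one might harden it with checkCudaErrors from the helper_cuda.h header it already includes, shown here only for the allocation loop and the warm-up launch (the snippet reuses the globals defined in the listing):

// Sketch only: same globals as above, calls wrapped in checkCudaErrors
// (a macro from helper_cuda.h that prints file/line and aborts on failure).
for (int i = 0; i < STREAM_COUNT; ++i)
{
    checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
    checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

    checkCudaErrors(cudaStreamCreate(&stream[i]));
    checkCudaErrors(cudaEventCreate(&cycleDone[i]));
    checkCudaErrors(cudaEventRecord(cycleDone[i], stream[i]));
}

// Kernel launches return void; check them separately after the launch.
incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N);
checkCudaErrors(cudaGetLastError());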
▶ Output
	Start.

	Device [0]: GeForce GTX 1070, compute capability 6.1, 16 MP(s) x 128 (Cores/MP) = 2048 (Cores)

	Relevant properties of this CUDA device.
	(Y) Execute several GPU kernels simultaneously
	(Y) Overlap one CPU<->GPU data transfer with GPU kernel execution
	(Y) Overlap two CPU<->GPU data transfers with GPU kernel execution

	scale_factor = 1.00
	array_size = 4194304

	Measured timings (throughput):
	Memcpy host to device:	1.276192 ms (13.146311 GB/s)
	Memcpy device to host:	1.279008 ms (13.117366 GB/s)
	Kernel: 	1.312768 ms (127.800314 GB/s)

	Theoretical limits for speedup gained from overlapped data transfers:
	No overlap (transfer-kernel-transfer):	3.867968 ms
	Overlap one transfer: 	2.555200 ms
	Overlap both data transfers: 	1.312768 ms

	Average measured timings over 10 repetitions:
	Avg. time serialized: 	3.992167 ms (8.405068 GB/s)
	Avg. time using 4 streams:	1.896141 ms (17.696171 GB/s)
	Avg. speedup gained: 	2.096026 ms

	Result test: Passed.
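The three theoretical limits printed above follow directly from the three measured single-operation timings:

● No overlap: the operations run back to back, 1.276192 + 1.279008 + 1.312768 = 3.867968 ms.

● One copy engine overlapped: the two transfers share one engine but can hide behind the kernel, max(1.276192 + 1.279008, 1.312768) = 2.555200 ms.

● Both transfers overlapped: upload, download, and kernel all proceed concurrently, max(1.276192, 1.279008, 1.312768) = 1.312768 ms.

The measured 4-stream average of 1.896141 ms falls between the two overlap limits, giving the reported gain of about 2.1 ms per cycle over the serialized 3.992167 ms.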
▶ Takeaways
● None.