使用多台 GPU 进行计算
▶ 源代码。使用不同的流来控制不同 GPU 上的运算任务。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleMultiGPU.h"

// Maximum number of GPUs this sample will drive at once.
const int MAX_GPU_COUNT = 32;
// Total number of input elements, split across the available GPUs.
const int DATA_N = 1048576 * 32;

// Check a CUDA runtime call and abort with file/line diagnostics on failure.
// CUDA calls fail silently otherwise: a sticky error makes every later call
// fail mysteriously, so every call is wrapped.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Partial reduction kernel: each thread strides over d_Input, accumulates a
// private sum, and writes it to d_Result[tid]. The host finishes the
// reduction over the gridDim.x * blockDim.x partial sums.
// Launch layout: 1-D grid; d_Result must hold gridDim.x * blockDim.x floats.
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0.0f;

    // Grid-stride loop: any launch configuration covers all N elements.
    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos];

    d_Result[tid] = sum;
}

int main(int argc, char **argv)
{
    printf("\n\tStart.\n");

    const int BLOCK_N = 32, THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;  // partial sums produced per GPU
    int i, j, GPU_N;
    double sumGPU;  // accumulate in double to reduce rounding in the final sum
    TGPUplan plan[MAX_GPU_COUNT];

    CUDA_CHECK(cudaGetDeviceCount(&GPU_N));
    GPU_N = MIN(GPU_N, MAX_GPU_COUNT);
    printf("\n\tDevice count: %i\n", GPU_N);

    // Split the data evenly across the GPUs.
    for (i = 0; i < GPU_N; i++)
        plan[i].dataN = DATA_N / GPU_N;

    // Distribute the remainder when DATA_N is not a multiple of GPU_N.
    for (i = 0; i < DATA_N % GPU_N; i++)
        plan[i].dataN++;

    // Per-GPU setup: stream, device buffers, pinned host buffers, input data.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamCreate(&plan[i].stream));
        CUDA_CHECK(cudaMalloc((void **)&plan[i].d_data, plan[i].dataN * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void **)&plan[i].d_sum, ACCUM_N * sizeof(float)));
        // Pinned host memory is required for truly asynchronous copies.
        CUDA_CHECK(cudaMallocHost((void **)&plan[i].h_sum_from_device, ACCUM_N * sizeof(float)));
        CUDA_CHECK(cudaMallocHost((void **)&plan[i].h_data, plan[i].dataN * sizeof(float)));

        for (j = 0; j < plan[i].dataN; j++)
            plan[i].h_data[j] = (float)rand() / (float)RAND_MAX;
    }

    StartTimer();  // start wall-clock timing of the multi-GPU phase

    // Enqueue work on every GPU without waiting, so all devices run
    // concurrently: plan[i].h_data -> d_data -> d_sum -> h_sum_from_device.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaMemcpyAsync(plan[i].d_data, plan[i].h_data,
                                   plan[i].dataN * sizeof(float),
                                   cudaMemcpyHostToDevice, plan[i].stream));
        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_sum, plan[i].d_data, plan[i].dataN);
        CUDA_CHECK(cudaGetLastError());  // kernel launches do not return errors directly
        CUDA_CHECK(cudaMemcpyAsync(plan[i].h_sum_from_device, plan[i].d_sum,
                                   ACCUM_N * sizeof(float),
                                   cudaMemcpyDeviceToHost, plan[i].stream));
    }

    // Wait for each GPU, then finish its reduction on the host:
    // plan[i].h_sum_from_device -> plan[i].h_sum.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamSynchronize(plan[i].stream));

        for (j = 0, plan[i].h_sum = 0.0f; j < ACCUM_N; j++)
            plan[i].h_sum += plan[i].h_sum_from_device[j];
    }
    // Final CPU reduction over the per-GPU sums.
    for (i = 0, sumGPU = 0.0; i < GPU_N; i++)
        sumGPU += plan[i].h_sum;
    printf("\n\tGPU Processing time: %f (ms)\n", GetTimer());

    // CPU reference result: plan[i].h_data -> sumCPU.
    double sumCPU = 0;
    for (i = 0; i < GPU_N; i++)
    {
        for (j = 0; j < plan[i].dataN; j++)
            sumCPU += plan[i].h_data[j];
    }

    // Compare GPU and CPU results with a relative tolerance; bit-exact
    // equality is not expected between float and double accumulation orders.
    double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf("\n\tGPU sum: %f\n\tCPU sum: %f\n", sumGPU, sumCPU);
    printf("\n\tRelative difference: %E, %s\n", diff, (diff < 1e-5) ? "Passed" : "Failed");

    // Cleanup: the device that owns each resource must be current when
    // freeing it, hence cudaSetDevice() inside the loop.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaFreeHost(plan[i].h_data));
        CUDA_CHECK(cudaFreeHost(plan[i].h_sum_from_device));
        CUDA_CHECK(cudaFree(plan[i].d_sum));
        CUDA_CHECK(cudaFree(plan[i].d_data));
        CUDA_CHECK(cudaStreamDestroy(plan[i].stream));
    }

    getchar();
    return 0;
}
▶ 输出结果
Start. Device count: 1 GPU Processing time: 13.726471 (ms) GPU sum: 16779778.000000 CPU sum: 16779776.312309 Relative difference: 1.005789E-07, Passed
▶ 涨姿势
● CUDA 运行时函数(包括内核启动、内存分配以及 cudaFree 等主机端函数)都作用于"当前设备"。在多 GPU 程序中,对某个设备执行任何操作之前,必须先调用 cudaSetDevice() 将其设为当前设备,否则操作会落在错误的设备上。