使用多台 GPU 进行计算
▶ 源代码。使用不同的流来控制不同 GPU 上的运算任务。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "simpleMultiGPU.h"

// Maximum number of GPUs this sample will drive at once.
const int MAX_GPU_COUNT = 32;
// Total number of input elements, split across the available GPUs.
const int DATA_N = 1048576 * 32;

// Check a CUDA runtime call and abort with file/line diagnostics on failure.
// CUDA calls fail silently otherwise: a sticky error makes every later call
// fail mysteriously, so every call is wrapped.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Partial reduction kernel: each thread strides over d_Input, accumulates a
// private sum, and writes it to d_Result[tid]. The host finishes the
// reduction over the gridDim.x * blockDim.x partial sums.
// Launch layout: 1-D grid; d_Result must hold gridDim.x * blockDim.x floats.
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0.0f;

    // Grid-stride loop: any launch configuration covers all N elements.
    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos];

    d_Result[tid] = sum;
}

int main(int argc, char **argv)
{
    printf("\n\tStart.\n");

    const int BLOCK_N = 32, THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;  // partial sums produced per GPU
    int i, j, GPU_N;
    double sumGPU;  // accumulate in double to reduce rounding in the final sum
    TGPUplan plan[MAX_GPU_COUNT];

    CUDA_CHECK(cudaGetDeviceCount(&GPU_N));
    GPU_N = MIN(GPU_N, MAX_GPU_COUNT);
    printf("\n\tDevice count: %i\n", GPU_N);

    // Split the data evenly across the GPUs.
    for (i = 0; i < GPU_N; i++)
        plan[i].dataN = DATA_N / GPU_N;

    // Distribute the remainder when DATA_N is not a multiple of GPU_N.
    for (i = 0; i < DATA_N % GPU_N; i++)
        plan[i].dataN++;

    // Per-GPU setup: stream, device buffers, pinned host buffers, input data.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamCreate(&plan[i].stream));
        CUDA_CHECK(cudaMalloc((void **)&plan[i].d_data, plan[i].dataN * sizeof(float)));
        CUDA_CHECK(cudaMalloc((void **)&plan[i].d_sum, ACCUM_N * sizeof(float)));
        // Pinned host memory is required for truly asynchronous copies.
        CUDA_CHECK(cudaMallocHost((void **)&plan[i].h_sum_from_device, ACCUM_N * sizeof(float)));
        CUDA_CHECK(cudaMallocHost((void **)&plan[i].h_data, plan[i].dataN * sizeof(float)));

        for (j = 0; j < plan[i].dataN; j++)
            plan[i].h_data[j] = (float)rand() / (float)RAND_MAX;
    }

    StartTimer();  // start wall-clock timing of the multi-GPU phase

    // Enqueue work on every GPU without waiting, so all devices run
    // concurrently: plan[i].h_data -> d_data -> d_sum -> h_sum_from_device.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaMemcpyAsync(plan[i].d_data, plan[i].h_data,
                                   plan[i].dataN * sizeof(float),
                                   cudaMemcpyHostToDevice, plan[i].stream));
        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_sum, plan[i].d_data, plan[i].dataN);
        CUDA_CHECK(cudaGetLastError());  // kernel launches do not return errors directly
        CUDA_CHECK(cudaMemcpyAsync(plan[i].h_sum_from_device, plan[i].d_sum,
                                   ACCUM_N * sizeof(float),
                                   cudaMemcpyDeviceToHost, plan[i].stream));
    }

    // Wait for each GPU, then finish its reduction on the host:
    // plan[i].h_sum_from_device -> plan[i].h_sum.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamSynchronize(plan[i].stream));

        for (j = 0, plan[i].h_sum = 0.0f; j < ACCUM_N; j++)
            plan[i].h_sum += plan[i].h_sum_from_device[j];
    }
    // Final CPU reduction over the per-GPU sums.
    for (i = 0, sumGPU = 0.0; i < GPU_N; i++)
        sumGPU += plan[i].h_sum;
    printf("\n\tGPU Processing time: %f (ms)\n", GetTimer());

    // CPU reference result: plan[i].h_data -> sumCPU.
    double sumCPU = 0;
    for (i = 0; i < GPU_N; i++)
    {
        for (j = 0; j < plan[i].dataN; j++)
            sumCPU += plan[i].h_data[j];
    }

    // Compare GPU and CPU results with a relative tolerance; bit-exact
    // equality is not expected between float and double accumulation orders.
    double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf("\n\tGPU sum: %f\n\tCPU sum: %f\n", sumGPU, sumCPU);
    printf("\n\tRelative difference: %E, %s\n", diff, (diff < 1e-5) ? "Passed" : "Failed");

    // Cleanup: the device that owns each resource must be current when
    // freeing it, hence cudaSetDevice() inside the loop.
    for (i = 0; i < GPU_N; i++)
    {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaFreeHost(plan[i].h_data));
        CUDA_CHECK(cudaFreeHost(plan[i].h_sum_from_device));
        CUDA_CHECK(cudaFree(plan[i].d_sum));
        CUDA_CHECK(cudaFree(plan[i].d_data));
        CUDA_CHECK(cudaStreamDestroy(plan[i].stream));
    }

    getchar();
    return 0;
}
▶ 输出结果
Start. Device count: 1 GPU Processing time: 13.726471 (ms) GPU sum: 16779778.000000 CPU sum: 16779776.312309 Relative difference: 1.005789E-07, Passed
▶ 涨姿势
● CUDA 运行时函数(包括内核启动、内存分配以及 cudaFree 等主机端函数)都作用于"当前设备"。在多 GPU 程序中,对某个设备执行任何操作之前,必须先调用 cudaSetDevice() 将其设为当前设备,否则操作会落在错误的设备上。