Use CUDA's overlap capability to run kernel execution and host-device data transfers concurrently for a speedup.
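
The idea: kernel launches and cudaMemcpyAsync calls issued into different streams are asynchronous with respect to the host and to each other, so a GPU with at least one copy engine can run one stream's kernel while another stream's transfer is in flight. Before the full benchmark, here is a minimal sketch of the pattern; the names (addOne, h_in, d_in, s, ...) are illustrative only and are not part of the sample below.

#include <cuda_runtime.h>

#define NSTREAM 4

__global__ void addOne(int *g_out, const int *g_in, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        g_out[idx] = g_in[idx] + 1;
}

int main()
{
    const int n = 1 << 20;
    const int bytes = n * sizeof(int);
    int *h_in[NSTREAM], *h_out[NSTREAM], *d_in[NSTREAM], *d_out[NSTREAM];
    cudaStream_t s[NSTREAM];

    for (int i = 0; i < NSTREAM; ++i)
    {
        // pinned (page-locked) host memory: cudaMemcpyAsync generally cannot
        // overlap with kernel execution when the host buffer is pageable
        cudaHostAlloc(&h_in[i], bytes, cudaHostAllocDefault);
        cudaHostAlloc(&h_out[i], bytes, cudaHostAllocDefault);
        cudaMalloc(&d_in[i], bytes);
        cudaMalloc(&d_out[i], bytes);
        cudaStreamCreate(&s[i]);
        for (int j = 0; j < n; ++j)
            h_in[i][j] = 0;
    }

    // one independent upload - compute - download chain per stream;
    // the hardware is free to overlap chains from different streams
    for (int i = 0; i < NSTREAM; ++i)
    {
        cudaMemcpyAsync(d_in[i], h_in[i], bytes, cudaMemcpyHostToDevice, s[i]);
        addOne<<<(n + 511) / 512, 512, 0, s[i]>>>(d_out[i], d_in[i], n);
        cudaMemcpyAsync(h_out[i], d_out[i], bytes, cudaMemcpyDeviceToHost, s[i]);
    }
    cudaDeviceSynchronize();    // wait for every stream to drain

    for (int i = 0; i < NSTREAM; ++i)
    {
        cudaFreeHost(h_in[i]);
        cudaFreeHost(h_out[i]);
        cudaFree(d_in[i]);
        cudaFree(d_out[i]);
        cudaStreamDestroy(s[i]);
    }
    return 0;
}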

▶ Source code. Four streams execute a total of 10 "upload - kernel compute - download" cycles, and the elapsed time is recorded.

#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define STREAM_COUNT 4
#define NREPS        10
#define INNER_REPS   5

int N = 1 << 22;
int memsize;

int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

dim3 grid;
dim3 block(512);
cudaEvent_t cycleDone[STREAM_COUNT], start, stop;
cudaStream_t stream[STREAM_COUNT];

__global__ void incKernel(int *g_out, int *g_in, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
    {
        for (int i = 0; i < INNER_REPS; ++i)    // brute-force repeat 5 times to add work; won't the compiler optimize this away?
            g_out[idx] = g_in[idx] + 1;
    }
}
 36 
float processWithStreams(int streams_used)
{
    cudaEventRecord(start, 0);
    for (int i = 0, current_stream = 0; i < NREPS; ++i)
    {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // consume the data downloaded in the previous cycle
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // refresh the data to be uploaded
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // make sure the work queued on stream next_stream in the previous cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // launch the kernel on the current stream
        incKernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream], d_data_in[current_stream], N);

        // upload data for the next stream
        cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]);

        // download results from the current stream
        cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream], memsize, cudaMemcpyDeviceToHost, stream[current_stream]);

        cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);
    cudaDeviceSynchronize();

    float time;
    cudaEventElapsedTime(&time, start, stop);
    return time;
}
 75 
bool test()
{
    bool passed = true;
    for (int j = 0; j < STREAM_COUNT; ++j)
    {
        for (int i = 0; i < N; ++i)
            passed &= (h_data_out[j][i] == 1);
    }
    return passed;
}
 86 
int main(int argc, char *argv[])
{
    printf("\n\tStart.\n");

    // pick a device and query its capabilities
    int cuda_device = 0;
    cudaDeviceProp deviceProp;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        if ((cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=")) < 0)
        {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else
        {
            printf("cuda_device = %d\n", cuda_device);
            if ((cuda_device = gpuDeviceInit(cuda_device)) < 0)
            {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else
    {
        cuda_device = gpuGetMaxGflopsDeviceId();
        cudaSetDevice(cuda_device);
        cudaGetDeviceProperties(&deviceProp, cuda_device);
        printf("\n\tDevice [%d]: %s, compute capability %d.%d, ", cuda_device, deviceProp.name, deviceProp.major, deviceProp.minor);
    }
    cudaGetDeviceProperties(&deviceProp, cuda_device);
    printf("%d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),    // cores per SM for this compute capability, defined in helper_cuda.h
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("\n\tRelevant properties of this CUDA device.\n");
    printf("\t(%c) Execute several GPU kernels simultaneously\n", deviceProp.major >= 2 ? 'Y' : 'N');
    printf("\t(%c) Overlap one CPU<->GPU data transfer with GPU kernel execution\n", deviceProp.deviceOverlap ? 'Y' : 'N');
    printf("\t(%c) Overlap two CPU<->GPU data transfers with GPU kernel execution\n", (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? 'Y' : 'N');

    // scale the workload down if the device has fewer than 32 CUDA cores
    float scale_factor = max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)), 1.0f);
    N = (int)((float)N / scale_factor);
    printf("\n\tscale_factor = %.2f\n\tarray_size   = %d\n", 1.0f / scale_factor, N);

    // set up the launch configuration
    memsize = N * sizeof(int);
    int thread_blocks = N / block.x;
    grid.x = thread_blocks % 65535;    // note: incKernel only indexes blockIdx.x; with N <= 1 << 22 there are at most 8192 blocks, so grid.y stays 1
    grid.y = (thread_blocks / 65535 + 1);

    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);
    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);    // pinned host memory, needed for truly asynchronous copies
        cudaMalloc(&d_data_in[i], memsize);
        cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault);
        cudaMalloc(&d_data_out[i], memsize);

        cudaStreamCreate(&stream[i]);
        cudaEventCreate(&cycleDone[i]);
        cudaEventRecord(cycleDone[i], stream[i]);
    }
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // initialize h_data_source and h_data_in
    for (int i = 0; i < N; ++i)
        h_data_source[i] = 0;
    for (int i = 0; i < STREAM_COUNT; ++i)
        memcpy(h_data_in[i], h_data_source, memsize);

    // warm-up run
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N);

    // baseline measurements: time a single upload, download, and kernel run
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n\tMeasured timings (throughput):\n");
    printf("\tMemcpy host to device:\t%f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf("\tMemcpy device to host:\t%f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf("\tKernel:               \t%f ms (%f GB/s)\n", kernel_time, (INNER_REPS * memsize * 2e-6) / kernel_time);

    printf("\n\tTheoretical limits for speedup gained from overlapped data transfers:\n");
    printf("\tNo overlap (transfer-kernel-transfer):\t%f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("\tOverlap one transfer:                 \t%f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("\tOverlap both data transfers:          \t%f ms\n", max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

    // timed runs: fully serialized (1 stream) vs. overlapped (STREAM_COUNT streams)
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\n\tAverage measured timings over %d repetitions:\n", NREPS);
    printf("\tAvg. time serialized:      \t%f ms (%f GB/s)\n", serial_time / NREPS, (NREPS * (memsize * 2e-6)) / serial_time);
    printf("\tAvg. time using %d streams:\t%f ms (%f GB/s)\n", STREAM_COUNT, overlap_time / NREPS, (NREPS * (memsize * 2e-6)) / overlap_time);
    printf("\tAvg. speedup gained:       \t%f ms\n", (serial_time - overlap_time) / NREPS);

    printf("\n\tResult test: %s.\n", test() ? "Passed" : "Failed");

    // cleanup
    free(h_data_source);
    free(h_data_sink);
    for (int i = 0; i < STREAM_COUNT; ++i)
    {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);
        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    getchar();
    return 0;
}
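
For reference, the loop in processWithStreams forms a software pipeline across the streams: in each cycle the current stream runs its kernel and downloads its result while the upload for the next stream is already in flight. Roughly (illustrative timeline, 4 streams):

    cycle 0:  stream 0: kernel, D2H   |   stream 1: H2D
    cycle 1:  stream 1: kernel, D2H   |   stream 2: H2D
    cycle 2:  stream 2: kernel, D2H   |   stream 3: H2D
    ...

The cudaEventSynchronize(cycleDone[next_stream]) call keeps the host from queueing new work into a stream whose previous cycle has not finished yet.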

▶ Output

    Start.

    Device [0]: GeForce GTX 1070, compute capability 6.1, 16 MP(s) x 128 (Cores/MP) = 2048 (Cores)

    Relevant properties of this CUDA device.
    (Y) Execute several GPU kernels simultaneously
    (Y) Overlap one CPU<->GPU data transfer with GPU kernel execution
    (Y) Overlap two CPU<->GPU data transfers with GPU kernel execution

    scale_factor = 1.00
    array_size   = 4194304

    Measured timings (throughput):
    Memcpy host to device:  1.276192 ms (13.146311 GB/s)
    Memcpy device to host:  1.279008 ms (13.117366 GB/s)
    Kernel:                 1.312768 ms (127.800314 GB/s)

    Theoretical limits for speedup gained from overlapped data transfers:
    No overlap (transfer-kernel-transfer):  3.867968 ms
    Overlap one transfer:                   2.555200 ms
    Overlap both data transfers:            1.312768 ms

    Average measured timings over 10 repetitions:
    Avg. time serialized:           3.992167 ms (8.405068 GB/s)
    Avg. time using 4 streams:      1.896141 ms (17.696171 GB/s)
    Avg. speedup gained:            2.096026 ms

    Result test: Passed.
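
As a sanity check, the three theoretical limits above are computed straight from the measured single-operation timings:

    no overlap          = t_h2d + t_kernel + t_d2h = 1.276 + 1.313 + 1.279 ≈ 3.868 ms
    overlap one copy    = max(t_h2d + t_d2h, t_kernel) = max(2.555, 1.313) ≈ 2.555 ms
    overlap both copies = max(t_h2d, t_d2h, t_kernel) ≈ 1.313 ms

The measured 1.896 ms per cycle with 4 streams falls between the one-copy-engine and the two-copy-engine bound: real overlap, but not perfect.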


▶ Lessons learned

● None.

posted on 2017-11-23 11:12