爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

使用三种模式(QUICK_MODE,RANGE_MODE,SHMOO_MODE),测试三种拷贝情况下的显存带宽(HostToDevice,DeviceToHost,DeviceToDevice)

▶ 源代码

  1 #include <iostream>
  2 #include <cuda_runtime.h>
  3 #include "device_launch_parameters.h"
  4 #include <cuda.h>
  5 #include <helper_cuda.h>       
  6 #include <helper_functions.h> 
  7 
  8 #define MEMCOPY_ITERATIONS      100
  9 #define DEFAULT_SIZE            ( 32 * ( 1 << 20 ) )//32 M
 10 #define DEFAULT_INCREMENT       (1 << 22)           //4 M
 11 #define CACHE_CLEAR_SIZE        (1 << 24)           //16 M
 12 #define SHMOO_MEMSIZE_MAX       (1 << 26)           //64 M
 13 #define SHMOO_MEMSIZE_START     (1 << 10)           //1 KB
 14 #define SHMOO_INCREMENT_1KB     (1 << 10)           //1 KB
 15 #define SHMOO_INCREMENT_2KB     (1 << 11)           //2 KB
 16 #define SHMOO_INCREMENT_10KB    (10 * (1 << 10))    //10KB
 17 #define SHMOO_INCREMENT_100KB   (100 * (1 << 10))   //100 KB
 18 #define SHMOO_INCREMENT_1MB     (1 << 20)           //1 MB
 19 #define SHMOO_INCREMENT_2MB     (1 << 21)           //2 MB
 20 #define SHMOO_INCREMENT_4MB     (1 << 22)           //4 MB
 21 #define SHMOO_LIMIT_20KB        (20 * (1 << 10))    //20 KB
 22 #define SHMOO_LIMIT_50KB        (50 * (1 << 10))    //50 KB
 23 #define SHMOO_LIMIT_100KB       (100 * (1 << 10))   //100 KB
 24 #define SHMOO_LIMIT_1MB         (1 << 20)           //1 MB
 25 #define SHMOO_LIMIT_16MB        (1 << 24)           //16 MB
 26 #define SHMOO_LIMIT_32MB        (1 << 25)           //32 MB
 27 #define DEBUG
 28 
 29 static bool bDontUseGPUTiming;
 30 const char *sMemoryCopyKind[] = { "Device to Host", "Host to Device", "Device to Device", NULL };
 31 const char *sMemoryMode[] = { "PINNED", "PAGEABLE", NULL };
 32 enum testMode   { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
 33 enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
 34 enum printMode  { USER_READABLE, CSV };
 35 enum memoryMode { PINNED, PAGEABLE };
 36 
 37 void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs)
 38 {
 39     int i;
 40     printf("\n%s Bandwidth, %i Device(s), %s Memory Transfers\n", sMemoryCopyKind[kind], iNumDevs, sMemoryMode[memMode]);
 41     printf("    Transfer Size: Bytes\tBandwidth: MB/s\n");
 42     for (i = 0; i < count; i++)
 43         printf("    %u\t\t\t%.1f\n", memSizes[i], bandwidths[i]);
 44 }
 45 
 46 void printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc)
 47 {
 48     std::string sConfig;
 49     if (kind == DEVICE_TO_DEVICE)
 50         sConfig += "-D2D";
 51     else
 52     {
 53         if (kind == DEVICE_TO_HOST)
 54             sConfig += "-D2H";
 55         else if (kind == HOST_TO_DEVICE)
 56             sConfig += "-H2D";
 57 
 58         if (memMode == PAGEABLE)
 59             sConfig += "-Paged";
 60         else if (memMode == PINNED)
 61         {
 62             sConfig += "-Pinned";
 63             if (wc)
 64                 sConfig += "-WriteCombined";
 65         }
 66     }
 67     for (int i = 0; i < count; i++)
 68         printf("BandwidthTest %s, Bandwidth = %.1f MB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n",
 69             sConfig.c_str(), bandwidths[i], (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20)), memSizes[i], iNumDevs);
 70 }
 71 
 72 void printHelp(void)
 73 {
 74     printf("Usage:  bandwidthTest [OPTION]...\n");
 75     printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
 76     printf("\n");
 77     printf("Example:  measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
 78     printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");
 79     printf("\n");
 80     printf("Options:\n");
 81     printf("--help\tDisplay this help menu\n");
 82     printf("--csv\tPrint results as a CSV\n");
 83     printf("--device=[deviceno]\tSpecify the device device to be used\n");
 84     printf("  all - compute cumulative bandwidth on all the devices\n");
 85     printf("  0,1,2,...,n - Specify any particular device to be used\n");
 86     printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
 87     printf("  pageable - pageable memory\n");
 88     printf("  pinned   - non-pageable system memory\n");
 89     printf("--mode=[MODE]\tSpecify the mode to use\n");
 90     printf("  quick - performs a quick measurement\n");
 91     printf("  range - measures a user-specified range of values\n");
 92     printf("  shmoo - performs an intense shmoo of a large range of values\n");
 93     printf("--htod\tMeasure host to device transfers\n");
 94     printf("--dtoh\tMeasure device to host transfers\n");
 95     printf("--dtod\tMeasure device to device transfers\n");
 96 #if CUDART_VERSION >= 2020
 97     printf("--wc\tAllocate pinned memory as write-combined\n");
 98 #endif
 99     printf("--cputiming\tForce CPU-based timing always\n");
100     printf("Range mode options\n");
101     printf("--start=[SIZE]\tStarting transfer size in bytes\n");
102     printf("--end=[SIZE]\tEnding transfer size in bytes\n");
103     printf("--increment=[SIZE]\tIncrement size in bytes\n");
104     return;
105 }
106 
107 float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
108 {
109     float elapsedTime = 0.0f;
110     StopWatchInterface *timer = NULL;
111     sdkCreateTimer(&timer);
112     cudaEvent_t start, stop;
113     cudaEventCreate(&start);
114     cudaEventCreate(&stop); 
115 
116     unsigned char *h_data, *d_data;
117     if (memMode == PINNED)// 使用页锁定内存或者可分页内存
118     {
119         if (CUDART_VERSION >= 2020)
120         {
121             cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0);
122             cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0);
123         }
124         else
125         {
126             cudaMallocHost((void **)&h_data, memSize);
127             cudaMallocHost((void **)&h_data, memSize);
128         }
129     }
130     else
131     {
132         h_data = (unsigned char *)malloc(memSize);// 先放点东西到设备内存中,在收回的时候测试时间
133         if (h_data == NULL || h_data == NULL)
134         {
135             fprintf(stderr, "\nNo host memory to run testDeviceToHostTransfer\n");
136             exit(EXIT_FAILURE);
137         }
138     }
139     for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++)
140         h_data[i] = (unsigned char)(i & 0xff);
141     cudaMalloc((void **)&d_data, memSize);
142     cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice);
143 
144     sdkStartTimer(&timer);
145     cudaEventRecord(start, 0);
146     if (memMode == PINNED)
147     {
148         for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
149             cudaMemcpyAsync(h_data, d_data, memSize, cudaMemcpyDeviceToHost, 0);        
150     }
151     else
152     {
153         for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
154             cudaMemcpy(h_data, d_data, memSize,cudaMemcpyDeviceToHost);
155     }
156     cudaEventRecord(stop, 0);
157     cudaDeviceSynchronize();
158     sdkStopTimer(&timer);
159     cudaEventElapsedTime(&elapsedTime, start, stop);
160     if (memMode != PINNED || bDontUseGPUTiming)// 不使用页锁定内存时只能使用 sdkGetTimerValue() 来计时 
161         elapsedTime = sdkGetTimerValue(&timer);
162 
163     sdkDeleteTimer(&timer);
164     cudaEventDestroy(start);
165     cudaEventDestroy(stop);
166     if (memMode == PINNED)
167         cudaFreeHost(h_data);
168     else
169         free(h_data);
170     cudaFree(d_data);
171     return ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20));
172 }
173 
174 float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
175 {
176     float elapsedTime = 0.0f;
177     StopWatchInterface *timer = NULL;
178     sdkCreateTimer(&timer);
179     cudaEvent_t start, stop;
180     cudaEventCreate(&start);
181     cudaEventCreate(&stop);
182 
183     unsigned char *h_data, *d_data,*h_cacheClear1,*h_cacheClear2;
184     if (PINNED == memMode)
185     {
186         if(CUDART_VERSION >= 2020)
187             cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0); 
188         else
189             cudaMallocHost((void **)&h_data, memSize);
190     }
191     else
192     {
193         h_data = (unsigned char *)malloc(memSize);
194         if (h_data == NULL)
195         {
196             fprintf(stderr, "\nNo host memory to run testHostToDeviceTransfer\n");
197             exit(EXIT_FAILURE);
198         }
199     }
200     for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++)
201         h_data[i] = (unsigned char)(i & 0xff);
202     h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);// 占位内存?
203     h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
204     if (h_cacheClear1 == NULL || h_cacheClear2 == NULL)
205     {
206         fprintf(stderr, "\nNo host memory to run testHostToDeviceTransfer\n");
207         exit(EXIT_FAILURE);
208     }
209     for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
210     {
211         h_cacheClear1[i] = (unsigned char)(i & 0xff);
212         h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
213     }
214     cudaMalloc((void **)&d_data, memSize);
215 
216     sdkStartTimer(&timer);
217     cudaEventRecord(start, 0);
218     if (memMode == PINNED)
219     {
220         for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
221             cudaMemcpyAsync(d_data, h_data, memSize,cudaMemcpyHostToDevice, 0);
222     }
223     else
224     {
225         for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
226             cudaMemcpy(d_data, h_data, memSize,cudaMemcpyHostToDevice);
227     }
228     cudaEventRecord(stop, 0);
229     cudaDeviceSynchronize();
230     sdkStopTimer(&timer);
231     cudaEventElapsedTime(&elapsedTime, start, stop);
232     if (memMode != PINNED || bDontUseGPUTiming)
233         elapsedTime = sdkGetTimerValue(&timer);
234 
235     sdkDeleteTimer(&timer);
236     cudaEventDestroy(start);
237     cudaEventDestroy(stop);
238     if (PINNED == memMode)
239         cudaFreeHost(h_data);
240     else
241         free(h_data);
242     free(h_cacheClear1);
243     free(h_cacheClear2);
244     cudaFree(d_data);
245     return ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20));;
246 }
247 
248 float testDeviceToDeviceTransfer(unsigned int memSize)
249 {
250     float elapsedTime = 0.0f;
251     StopWatchInterface *timer = NULL;
252     sdkCreateTimer(&timer);
253     cudaEvent_t start, stop;
254     cudaEventCreate(&start);
255     cudaEventCreate(&stop);
256 
257     unsigned char *h_data, *d_idata, *d_odata;
258     h_data = (unsigned char *)malloc(memSize);
259     if (h_data == 0)
260     {
261         fprintf(stderr, "\nNo host memory to run testDeviceToDeviceTransfer\n");
262         exit(EXIT_FAILURE);
263     }
264     cudaMalloc((void **)&d_idata, memSize);
265     cudaMalloc((void **)&d_odata, memSize);
266     for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++)
267         h_data[i] = (unsigned char)(i & 0xff);
268     cudaMemcpy(d_idata, h_data, memSize, cudaMemcpyHostToDevice);
269 
270     sdkStartTimer(&timer);
271     cudaEventRecord(start, 0);
272     for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
273         cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice);
274     cudaEventRecord(stop, 0);
275     cudaDeviceSynchronize();
276     sdkStopTimer(&timer);
277     cudaEventElapsedTime(&elapsedTime, start, stop);
278     if (bDontUseGPUTiming)
279         elapsedTime = sdkGetTimerValue(&timer);
280 
281     sdkDeleteTimer(&timer);
282     cudaEventDestroy(stop);
283     cudaEventDestroy(start);
284     free(h_data);
285     cudaFree(d_idata);
286     cudaFree(d_odata);
287     return 2.0f * ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20));
288 }
289 
290 void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
291 {
292     int i;
293     unsigned int count = 1 + ((end - start) / increment);
294     unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
295     double *bandwidths = (double *)malloc(count * sizeof(double));
296     for (i = 0; i < count; i++)
297         bandwidths[i] = 0.0;
298     // 逐设备测试
299     for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
300     {
301         cudaSetDevice(currentDevice);
302         for (i = 0; i < count; i++)
303         {
304             memSizes[i] = start + i * increment;
305             switch (kind)
306             {
307             case DEVICE_TO_HOST:
308                 bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
309                 break;
310             case HOST_TO_DEVICE:
311                 bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
312                 break;
313             case DEVICE_TO_DEVICE:
314                 bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
315                 break;
316             }
317         }
318     }
319 
320     if (printmode == CSV)
321         printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
322     else
323         printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice));
324     free(memSizes);
325     free(bandwidths);
326     return;
327 }
328 
329 void testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
330 {
331     int i;
332     unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
333         + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
334         + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
335         + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
336         + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
337         + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
338         + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
339     unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
340     double *bandwidths = (double *)malloc(count * sizeof(double));
341     for (i = 0; i < count; i++)
342         bandwidths[i] = 0.0;
343     // 逐设备测试
344     for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)
345     {
346         printf("\n"); 
347         cudaSetDevice(currentDevice);
348         for (unsigned int memSize = 0, i = 0; memSize <= SHMOO_MEMSIZE_MAX; i++)
349         {
350             if (memSize < SHMOO_LIMIT_20KB)
351                 memSize += SHMOO_INCREMENT_1KB;
352             else if (memSize < SHMOO_LIMIT_50KB)
353                 memSize += SHMOO_INCREMENT_2KB;
354             else if (memSize < SHMOO_LIMIT_100KB)
355                 memSize += SHMOO_INCREMENT_10KB;
356             else if (memSize < SHMOO_LIMIT_1MB)
357                 memSize += SHMOO_INCREMENT_100KB;
358             else if (memSize < SHMOO_LIMIT_16MB)
359                 memSize += SHMOO_INCREMENT_1MB;
360             else if (memSize < SHMOO_LIMIT_32MB)
361                 memSize += SHMOO_INCREMENT_2MB;
362             else
363                 memSize += SHMOO_INCREMENT_4MB;
364             memSizes[i] = memSize;
365             switch (kind)
366             {
367             case DEVICE_TO_HOST:
368                 bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
369                 break;
370             case HOST_TO_DEVICE:
371                 bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
372                 break;
373             case DEVICE_TO_DEVICE:
374                 bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
375                 break;
376             }
377             printf(".");
378         }
379     }
380 
381     if (CSV == printmode)
382         printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
383     else
384         printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice));
385     free(memSizes);
386     free(bandwidths);
387     return;
388 }
389 
390 void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc)
391 {
392     switch (mode)
393     {
394         case QUICK_MODE:
395             testBandwidthRange(DEFAULT_SIZE, DEFAULT_SIZE, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
396             break;
397         case RANGE_MODE:
398             testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
399             break;
400         case SHMOO_MODE:
401             testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
402             break;
403         default:
404             break;
405     }
406 }
407 
408 bool test(const int argc, const char **argv)
409 {
410     // 处理命令行参数
411     // 帮助模式,计时器,输出方式
412     if (checkCmdLineFlag(argc, argv, "help"))
413     {
414         printHelp();
415         return 0;
416     }
417     if (checkCmdLineFlag(argc, argv, "cputiming"))
418         bDontUseGPUTiming = true;
419     printMode printmode = USER_READABLE;
420     if (checkCmdLineFlag(argc, argv, "csv"))
421         printmode = CSV;
422     
423     // 内存模式,默认使用页锁定内存
424     memoryMode memMode = PINNED;
425     char *memModeStr;
426     if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr))
427     {
428         if (strcmp(memModeStr, "pageable") == 0)
429             memMode = PAGEABLE;
430         else if (strcmp(memModeStr, "pinned") == 0)
431             memMode = PINNED;
432         else
433         {
434             printf("\nInvalid memory mode - valid modes are pageable or pinned\n");
435             printf("\nSee --help for more information\n");
436             return false;
437         }
438     }
439     else
440         memMode = PINNED;
441     // 计算能力 2.2 以上,可选 cudaHostAllocWriteCombined 模式
442     bool wc = (CUDART_VERSION >= 2020 && checkCmdLineFlag(argc, argv, "wc")) ? true : false;
443     
444     // 设备
445     int startDevice = 0, endDevice = 0;
446     char *device;
447     if (getCmdLineArgumentString(argc, argv, "device", &device))
448     {
449         int deviceCount;
450         cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
451         if (error_id != cudaSuccess)
452         {
453             printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
454             exit(EXIT_FAILURE);
455         }
456         if (deviceCount == 0)
457         {
458             printf("\nNo devices found\n");
459             return false;
460         }
461         if (strcmp(device, "all") == 0)
462         {
463             printf("\nCumulative Bandwidth to be computed from all the devices\n");
464             startDevice = 0;
465             endDevice = deviceCount - 1;
466         }
467         else
468         {
469             startDevice = endDevice = atoi(device);
470             if (startDevice > deviceCount || startDevice < 0)
471             {
472                 printf("\nInvalid GPU number %d given hence default gpu 0 will be used\n", startDevice);
473                 startDevice = endDevice = 0;
474             }
475         }
476     }
477     for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)// 初始化设备
478     {
479         cudaDeviceProp deviceProp;
480         cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
481         if (error_id == cudaSuccess)
482         {
483             printf("  Device %d: %s\n", currentDevice, deviceProp.name);
484             if (deviceProp.computeMode == cudaComputeModeProhibited)
485             {
486                 fprintf(stderr, "\nError: device is running in <Compute Mode Prohibited>\n");
487                 cudaSetDevice(currentDevice);
488                 exit(EXIT_FAILURE);
489             }
490         }
491         else
492         {
493             printf("\ncudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
494             cudaSetDevice(currentDevice);
495             exit(EXIT_FAILURE);
496         }
497     }
498     
499     // 测试内容
500     bool htod = checkCmdLineFlag(argc, argv, "htod") ? true : false;
501     bool dtoh = checkCmdLineFlag(argc, argv, "dtoh") ? true : false;
502     bool dtod = checkCmdLineFlag(argc, argv, "dtod") ? true : false; 
503     // 测试模式,默认 quick 模式
504     testMode mode;
505     char *modeStr;
506     if (getCmdLineArgumentString(argc, argv, "mode", &modeStr))
507     {
508         if (strcmp(modeStr, "quick") == 0)
509             mode = QUICK_MODE;
510         else if (strcmp(modeStr, "shmoo") == 0)
511             mode = SHMOO_MODE;
512         else if (strcmp(modeStr, "range") == 0)
513             mode = RANGE_MODE;
514         else
515         {
516             printf("\nInvalid mode - valid modes are quick, range, or shmoo\n");
517             return false;
518         }
519     }
520     else
521         mode = QUICK_MODE;
522     int startSize, endSize, increment;
523     if (mode == RANGE_MODE)// range 模式需要给出最小和最大尺寸
524     {
525         if (checkCmdLineFlag(argc, (const char **)argv, "startSize"))
526         {
527             if ((startSize = getCmdLineArgumentInt(argc, argv, "startSize")) <= 0)
528             {
529                 printf("\nIllegal argument - startSize must be greater than zero\n");
530                 return false;
531             }
532         }
533         else
534         {
535             printf("\nMust specify a starting size in range mode\n");
536             return false;
537         }
538         if (checkCmdLineFlag(argc, (const char **)argv, "endSize"))
539         {
540             if ((endSize = getCmdLineArgumentInt(argc, argv, "endSize")) <= 0)
541             {
542                 printf("\nIllegal argument - endSize must be greater than zero\n");
543                 return false;
544             }
545             if (startSize > endSize)
546             {
547                 printf("\nIllegal argument - startSize is greater than endSize\n");
548                 return false;
549             }
550         }
551         else
552         {
553             printf("\nMust specify an endSize size in range mode.\n");
554             return false;
555         }
556         if (checkCmdLineFlag(argc, argv, "increment"))
557         {
558             if ((increment = getCmdLineArgumentInt(argc, argv, "increment")) <= 0)
559             {
560                 printf("\nIllegal argument - increment must be greater than zero\n");
561                 return false;
562             }
563         }
564         else
565         {
566             printf("\nMust specify an increment in user mode\n");
567             return false;
568         }
569     }
570     else
571         startSize = endSize = increment = DEFAULT_SIZE;
572     
573 #ifdef DEBUG
574     htod = dtoh = dtod = true;
575     mode = SHMOO_MODE;
576     wc = true;
577 #endif     
578     // 运行测试
579     printf("  %s, %s cudaHostAllocWriteCombined", (mode == QUICK_MODE) ? "QUICK_MODE" : ((mode == RANGE_MODE) ? "RANGE_MODE" : "SHMOO_MODE"), wc ? "enable" : "disable");
580     if (htod)
581         testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc); 
582     if (dtoh)
583         testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc);
584     if (dtod)
585         testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
586     for (int nDevice = startDevice; nDevice <= endDevice; nDevice++)// 多设备情况下需要逐一设备同步
587         cudaSetDevice(nDevice);
588     return true;
589 }
590 
591 int main(int argc, char **argv)
592 {
593     printf("Start.\n");
594     printf("Finish: %s.\n", test(argc, (const char **)argv)? "Result = PASS" : "Result = Fail");
595     getchar();
596     return 0;
597 }

▶ 输出结果:QUICK_MODE,是否写入合并都尝试了。

Start.
  Device 0: GeForce GTX 1070
  QUICK_MODE, enable cudaHostAllocWriteCombined
Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12014.1

Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12780.3

Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    185140.2

Finish: Result = PASS.
Start.
  Device 0: GeForce GTX 1070
  QUICK_MODE, disable cudaHostAllocWriteCombined
Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12369.4

Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12741.7

Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    185244.1

Finish: Result = PASS.
Start.
  Device 0: GeForce GTX 1070
  QUICK_MODE, disenable cudaHostAllocWriteCombined    // 使用老版本的函数 cudaMallocHost()
Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12205.4

Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    12814.8

Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    33554432                    187538.8
Finish: Result = PASS.

 

 

 

 

▶ 输出结果:SHMOO_MODE(RANGE_MODE 模式需要给 .exe 额外参数,这里没有测试)

Start.
  Device 0: GeForce GTX 1070
  SHMOO_MODE,  enable  cudaHostAllocWriteCombined
.................................................................................
Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    1024                        728.8
    2048                        1319.7
    3072                        1797.7
    4096                        2441.4
    5120                        2921.7
    6144                        3386.9
    7168                        3776.8
    8192                        4027.1
    9216                        4125.7
    10240                       4672.5
    11264                       5063.3
    12288                       5302.6
    13312                       5568.9
    14336                       5814.7
    15360                       6100.3
    16384                       6056.2
    17408                       6459.0
    18432                       6734.9
    19456                       6510.4
    20480                       6950.6
    22528                       7209.5
    24576                       7585.0
    26624                       7812.5
    28672                       7948.8
    30720                       8048.6
    32768                       8137.4
    34816                       8599.8
    36864                       8616.7
    38912                       8899.1
    40960                       9084.3
    43008                       9152.1
    45056                       9276.1
    47104                       9319.9
    49152                       9356.9
    51200                       9536.7
    61440                       10097.5
    71680                       10357.5
    81920                       10209.5
    92160                       10612.0
    102400                      10923.5
    204800                      11632.7
    307200                      12065.2
    409600                      12157.0
    512000                      12240.7
    614400                      12227.5
    716800                      12314.8
    819200                      12379.2
    921600                      12349.4
    1024000                     12126.7
    1126400                     12405.8
    2174976                     12254.6
    3223552                     12200.2
    4272128                     12343.5
    5320704                     12267.2
    6369280                     12405.5
    7417856                     12360.6
    8466432                     12376.7
    9515008                     12435.5
    10563584                    12441.0
    11612160                    12390.6
    12660736                    12354.7
    13709312                    12560.6
    14757888                    12540.4
    15806464                    12414.0
    16855040                    12387.1
    18952192                    12436.0
    21049344                    12455.3
    23146496                    12458.1
    25243648                    12438.6
    27340800                    12594.3
    29437952                    12459.4
    31535104                    12468.8
    33632256                    12464.1
    37826560                    12477.6
    42020864                    12609.3
    46215168                    12509.2
    50409472                    12511.1
    54603776                    12510.9
    58798080                    12544.5
    62992384                    12536.1
    67186688                    12589.3

.................................................................................
Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    1024                        1190.9
    2048                        2164.1
    3072                        2989.5
    4096                        3792.5
    5120                        4398.9
    6144                        4882.8
    7168                        5340.6
    8192                        5908.8
    9216                        6189.5
    10240                       6341.3
    11264                       6975.4
    12288                       7233.8
    13312                       7381.0
    14336                       7724.2
    15360                       7744.1
    16384                       8132.7
    17408                       8178.1
    18432                       8410.6
    19456                       8433.9
    20480                       8719.3
    22528                       9027.0
    24576                       9155.3
    26624                       9474.1
    28672                       9205.7
    30720                       9390.0
    32768                       9701.2
    34816                       9911.4
    36864                       10044.6
    38912                       10271.6
    40960                       10444.5
    43008                       10456.5
    45056                       10480.2
    47104                       10645.0
    49152                       10698.2
    51200                       10847.7
    61440                       11068.5
    71680                       11336.5
    81920                       11522.3
    92160                       11564.6
    102400                      11765.8
    204800                      12276.1
    307200                      12477.4
    409600                      12548.2
    512000                      12626.9
    614400                      12685.4
    716800                      12680.3
    819200                      12572.4
    921600                      12735.9
    1024000                     12746.6
    1126400                     12612.6
    2174976                     12590.1
    3223552                     12686.1
    4272128                     12613.7
    5320704                     12698.9
    6369280                     12744.6
    7417856                     12671.2
    8466432                     12725.5
    9515008                     12656.7
    10563584                    12748.5
    11612160                    12679.3
    12660736                    12633.1
    13709312                    12699.6
    14757888                    12742.6
    15806464                    12743.1
    16855040                    12778.5
    18952192                    12853.2
    21049344                    12720.3
    23146496                    12687.7
    25243648                    12723.4
    27340800                    12727.3
    29437952                    12735.1
    31535104                    12781.9
    33632256                    12728.6
    37826560                    12698.2
    42020864                    12741.9
    46215168                    12740.8
    50409472                    12727.9
    54603776                    12732.4
    58798080                    12732.0
    62992384                    12746.5
    67186688                    12721.2

.................................................................................
Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers
    Transfer Size: Bytes        Bandwidth: MB/s
    1024                        996.5
    2048                        1606.9
    3072                        2381.9
    4096                        3112.5
    5120                        3875.2
    6144                        4455.8
    7168                        5314.0
    8192                        5986.6
    9216                        6709.2
    10240                       7414.0
    11264                       8016.6
    12288                       8647.5
    13312                       9300.6
    14336                       9943.2
    15360                       9765.6
    16384                       13329.8
    17408                       13697.3
    18432                       6595.5
    19456                       16327.5
    20480                       16054.5
    22528                       17978.6
    24576                       19574.6
    26624                       20897.6
    28672                       22786.5
    30720                       24414.1
    32768                       25933.6
    34816                       27669.3
    36864                       29377.2
    38912                       30542.7
    40960                       33235.8
    43008                       35511.4
    45056                       35319.8
    47104                       37231.3
    49152                       39635.4
    51200                       40521.3
    61440                       39324.7
    71680                       69499.6
    81920                       70531.8
    92160                       77779.3
    102400                      85277.7
    204800                      162211.3
    307200                      204873.3
    409600                      258718.8
    512000                      274195.0
    614400                      309201.8
    716800                      261475.0
    819200                      362004.1
    921600                      391685.8
    1024000                     362360.9
    1126400                     155871.0
    2174976                     174356.8
    3223552                     179414.9
    4272128                     182220.3
    5320704                     170005.7
    6369280                     179022.1
    7417856                     177974.8
    8466432                     177474.9
    9515008                     182872.5
    10563584                    187523.3
    11612160                    184357.2
    12660736                    187779.5
    13709312                    188050.6
    14757888                    188358.7
    15806464                    188660.6
    16855040                    189137.7
    18952192                    189415.0
    21049344                    182262.0
    23146496                    183092.1
    25243648                    184434.2
    27340800                    183828.0
    29437952                    185503.0
    31535104                    184717.2
    33632256                    186353.3
    37826560                    185746.6
    42020864                    184612.9
    46215168                    185253.0
    50409472                    185704.4
    54603776                    186218.8
    58798080                    190486.9
    62992384                    190930.6
    67186688                    185773.8
Finish: Result = PASS.

 

▶ 涨姿势

● 申请全局内存时使用新版本的函数 cudaHostAlloc() 与老版本的函数 cudaMallocHost() 性能没有明显差距。

● 申请全局内存时是否指明合并写入标志,性能没有明显差距。

● 对不同大小的内存随便进行拷贝,可以发现带宽随碎片大小的增大而增大,并逐渐趋于稳定。设备之间的内存拷贝比设备与主机之间的拷贝速度高一个量级。

● 从主机内存向设备进行内存拷贝的时候使用了占位内存 h_cacheClear1 和 h_cacheClear2(共 32M 大小),这是为了消除主机内存缓存对内存拷贝的加速作用。

posted on 2017-12-14 19:59  爨爨爨好  阅读(1036)  评论(0编辑  收藏  举报