使用三种模式(QUICK_MODE,RANGE_MODE,SHMOO_MODE),测试三种拷贝情况下的显存带宽(HostToDevice,DeviceToHost,DeviceToDevice)
▶ 源代码
1 #include <iostream> 2 #include <cuda_runtime.h> 3 #include "device_launch_parameters.h" 4 #include <cuda.h> 5 #include <helper_cuda.h> 6 #include <helper_functions.h> 7 8 #define MEMCOPY_ITERATIONS 100 9 #define DEFAULT_SIZE ( 32 * ( 1 << 20 ) )//32 M 10 #define DEFAULT_INCREMENT (1 << 22) //4 M 11 #define CACHE_CLEAR_SIZE (1 << 24) //16 M 12 #define SHMOO_MEMSIZE_MAX (1 << 26) //64 M 13 #define SHMOO_MEMSIZE_START (1 << 10) //1 KB 14 #define SHMOO_INCREMENT_1KB (1 << 10) //1 KB 15 #define SHMOO_INCREMENT_2KB (1 << 11) //2 KB 16 #define SHMOO_INCREMENT_10KB (10 * (1 << 10)) //10KB 17 #define SHMOO_INCREMENT_100KB (100 * (1 << 10)) //100 KB 18 #define SHMOO_INCREMENT_1MB (1 << 20) //1 MB 19 #define SHMOO_INCREMENT_2MB (1 << 21) //2 MB 20 #define SHMOO_INCREMENT_4MB (1 << 22) //4 MB 21 #define SHMOO_LIMIT_20KB (20 * (1 << 10)) //20 KB 22 #define SHMOO_LIMIT_50KB (50 * (1 << 10)) //50 KB 23 #define SHMOO_LIMIT_100KB (100 * (1 << 10)) //100 KB 24 #define SHMOO_LIMIT_1MB (1 << 20) //1 MB 25 #define SHMOO_LIMIT_16MB (1 << 24) //16 MB 26 #define SHMOO_LIMIT_32MB (1 << 25) //32 MB 27 #define DEBUG 28 29 static bool bDontUseGPUTiming; 30 const char *sMemoryCopyKind[] = { "Device to Host", "Host to Device", "Device to Device", NULL }; 31 const char *sMemoryMode[] = { "PINNED", "PAGEABLE", NULL }; 32 enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE }; 33 enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; 34 enum printMode { USER_READABLE, CSV }; 35 enum memoryMode { PINNED, PAGEABLE }; 36 37 void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs) 38 { 39 int i; 40 printf("\n%s Bandwidth, %i Device(s), %s Memory Transfers\n", sMemoryCopyKind[kind], iNumDevs, sMemoryMode[memMode]); 41 printf(" Transfer Size: Bytes\tBandwidth: MB/s\n"); 42 for (i = 0; i < count; i++) 43 printf(" %u\t\t\t%.1f\n", memSizes[i], bandwidths[i]); 44 } 45 46 void printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int iNumDevs, bool wc) 47 { 48 std::string sConfig; 49 if (kind == DEVICE_TO_DEVICE) 50 sConfig += "-D2D"; 51 else 52 { 53 if (kind == DEVICE_TO_HOST) 54 sConfig += "-D2H"; 55 else if (kind == HOST_TO_DEVICE) 56 sConfig += "-H2D"; 57 58 if (memMode == PAGEABLE) 59 sConfig += "-Paged"; 60 else if (memMode == PINNED) 61 { 62 sConfig += "-Pinned"; 63 if (wc) 64 sConfig += "-WriteCombined"; 65 } 66 } 67 for (int i = 0; i < count; i++) 68 printf("BandwidthTest %s, Bandwidth = %.1f MB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n", 69 sConfig.c_str(), bandwidths[i], (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20)), memSizes[i], iNumDevs); 70 } 71 72 void printHelp(void) 73 { 74 printf("Usage: bandwidthTest [OPTION]...\n"); 75 printf("Test the bandwidth for device to host, host to device, and device to device transfers\n"); 76 printf("\n"); 77 printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); 78 printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n"); 79 printf("\n"); 80 printf("Options:\n"); 81 printf("--help\tDisplay this help menu\n"); 82 printf("--csv\tPrint results as a CSV\n"); 83 printf("--device=[deviceno]\tSpecify the device device to be used\n"); 84 printf(" all - compute cumulative bandwidth on all the devices\n"); 85 printf(" 0,1,2,...,n - Specify any particular device to be used\n"); 86 printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); 87 printf(" pageable - pageable memory\n"); 88 printf(" pinned - non-pageable system memory\n"); 89 printf("--mode=[MODE]\tSpecify the mode to use\n"); 90 printf(" quick - performs a quick measurement\n"); 91 printf(" range - measures a user-specified range of values\n"); 92 printf(" shmoo - performs an intense shmoo of a large range of values\n"); 93 printf("--htod\tMeasure host to device transfers\n"); 94 printf("--dtoh\tMeasure device to host transfers\n"); 95 printf("--dtod\tMeasure device to device transfers\n"); 96 #if CUDART_VERSION >= 2020 97 printf("--wc\tAllocate pinned memory as write-combined\n"); 98 #endif 99 printf("--cputiming\tForce CPU-based timing always\n"); 100 printf("Range mode options\n"); 101 printf("--start=[SIZE]\tStarting transfer size in bytes\n"); 102 printf("--end=[SIZE]\tEnding transfer size in bytes\n"); 103 printf("--increment=[SIZE]\tIncrement size in bytes\n"); 104 return; 105 } 106 107 float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) 108 { 109 float elapsedTime = 0.0f; 110 StopWatchInterface *timer = NULL; 111 sdkCreateTimer(&timer); 112 cudaEvent_t start, stop; 113 cudaEventCreate(&start); 114 cudaEventCreate(&stop); 115 116 unsigned char *h_data, *d_data; 117 if (memMode == PINNED)// 使用页锁定内存或者可分页内存 118 { 119 if (CUDART_VERSION >= 2020) 120 { 121 cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0); 122 cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0); 123 } 124 else 125 { 126 cudaMallocHost((void **)&h_data, memSize); 127 cudaMallocHost((void **)&h_data, memSize); 128 } 129 } 130 else 131 { 132 h_data = (unsigned char *)malloc(memSize);// 先放点东西到设备内存中,在收回的时候测试时间 133 if (h_data == NULL || h_data == NULL) 134 { 135 fprintf(stderr, "\nNo host memory to run testDeviceToHostTransfer\n"); 136 exit(EXIT_FAILURE); 137 } 138 } 139 for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) 140 h_data[i] = (unsigned char)(i & 0xff); 141 cudaMalloc((void **)&d_data, memSize); 142 cudaMemcpy(d_data, h_data, memSize, cudaMemcpyHostToDevice); 143 144 sdkStartTimer(&timer); 145 cudaEventRecord(start, 0); 146 if (memMode == PINNED) 147 { 148 for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) 149 cudaMemcpyAsync(h_data, d_data, memSize, cudaMemcpyDeviceToHost, 0); 150 } 151 else 152 { 153 for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) 154 cudaMemcpy(h_data, d_data, memSize,cudaMemcpyDeviceToHost); 155 } 156 cudaEventRecord(stop, 0); 157 cudaDeviceSynchronize(); 158 sdkStopTimer(&timer); 159 cudaEventElapsedTime(&elapsedTime, start, stop); 160 if (memMode != PINNED || bDontUseGPUTiming)// 不使用页锁定内存时只能使用 sdkGetTimerValue() 来计时 161 elapsedTime = sdkGetTimerValue(&timer); 162 163 sdkDeleteTimer(&timer); 164 cudaEventDestroy(start); 165 cudaEventDestroy(stop); 166 if (memMode == PINNED) 167 cudaFreeHost(h_data); 168 else 169 free(h_data); 170 cudaFree(d_data); 171 return ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20)); 172 } 173 174 float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) 175 { 176 float elapsedTime = 0.0f; 177 StopWatchInterface *timer = NULL; 178 sdkCreateTimer(&timer); 179 cudaEvent_t start, stop; 180 cudaEventCreate(&start); 181 cudaEventCreate(&stop); 182 183 unsigned char *h_data, *d_data,*h_cacheClear1,*h_cacheClear2; 184 if (PINNED == memMode) 185 { 186 if(CUDART_VERSION >= 2020) 187 cudaHostAlloc((void **)&h_data, memSize, wc ? cudaHostAllocWriteCombined : 0); 188 else 189 cudaMallocHost((void **)&h_data, memSize); 190 } 191 else 192 { 193 h_data = (unsigned char *)malloc(memSize); 194 if (h_data == NULL) 195 { 196 fprintf(stderr, "\nNo host memory to run testHostToDeviceTransfer\n"); 197 exit(EXIT_FAILURE); 198 } 199 } 200 for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) 201 h_data[i] = (unsigned char)(i & 0xff); 202 h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);// 占位内存? 203 h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); 204 if (h_cacheClear1 == NULL || h_cacheClear2 == NULL) 205 { 206 fprintf(stderr, "\nNo host memory to run testHostToDeviceTransfer\n"); 207 exit(EXIT_FAILURE); 208 } 209 for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) 210 { 211 h_cacheClear1[i] = (unsigned char)(i & 0xff); 212 h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); 213 } 214 cudaMalloc((void **)&d_data, memSize); 215 216 sdkStartTimer(&timer); 217 cudaEventRecord(start, 0); 218 if (memMode == PINNED) 219 { 220 for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) 221 cudaMemcpyAsync(d_data, h_data, memSize,cudaMemcpyHostToDevice, 0); 222 } 223 else 224 { 225 for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) 226 cudaMemcpy(d_data, h_data, memSize,cudaMemcpyHostToDevice); 227 } 228 cudaEventRecord(stop, 0); 229 cudaDeviceSynchronize(); 230 sdkStopTimer(&timer); 231 cudaEventElapsedTime(&elapsedTime, start, stop); 232 if (memMode != PINNED || bDontUseGPUTiming) 233 elapsedTime = sdkGetTimerValue(&timer); 234 235 sdkDeleteTimer(&timer); 236 cudaEventDestroy(start); 237 cudaEventDestroy(stop); 238 if (PINNED == memMode) 239 cudaFreeHost(h_data); 240 else 241 free(h_data); 242 free(h_cacheClear1); 243 free(h_cacheClear2); 244 cudaFree(d_data); 245 return ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20));; 246 } 247 248 float testDeviceToDeviceTransfer(unsigned int memSize) 249 { 250 float elapsedTime = 0.0f; 251 StopWatchInterface *timer = NULL; 252 sdkCreateTimer(&timer); 253 cudaEvent_t start, stop; 254 cudaEventCreate(&start); 255 cudaEventCreate(&stop); 256 257 unsigned char *h_data, *d_idata, *d_odata; 258 h_data = (unsigned char *)malloc(memSize); 259 if (h_data == 0) 260 { 261 fprintf(stderr, "\nNo host memory to run testDeviceToDeviceTransfer\n"); 262 exit(EXIT_FAILURE); 263 } 264 cudaMalloc((void **)&d_idata, memSize); 265 cudaMalloc((void **)&d_odata, memSize); 266 for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) 267 h_data[i] = (unsigned char)(i & 0xff); 268 cudaMemcpy(d_idata, h_data, memSize, cudaMemcpyHostToDevice); 269 270 sdkStartTimer(&timer); 271 cudaEventRecord(start, 0); 272 for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) 273 cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice); 274 cudaEventRecord(stop, 0); 275 cudaDeviceSynchronize(); 276 sdkStopTimer(&timer); 277 cudaEventElapsedTime(&elapsedTime, start, stop); 278 if (bDontUseGPUTiming) 279 elapsedTime = sdkGetTimerValue(&timer); 280 281 sdkDeleteTimer(&timer); 282 cudaEventDestroy(stop); 283 cudaEventDestroy(start); 284 free(h_data); 285 cudaFree(d_idata); 286 cudaFree(d_odata); 287 return 2.0f * ((float)(1 << 10) * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTime * (float)(1 << 20)); 288 } 289 290 void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) 291 { 292 int i; 293 unsigned int count = 1 + ((end - start) / increment); 294 unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); 295 double *bandwidths = (double *)malloc(count * sizeof(double)); 296 for (i = 0; i < count; i++) 297 bandwidths[i] = 0.0; 298 // 逐设备测试 299 for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) 300 { 301 cudaSetDevice(currentDevice); 302 for (i = 0; i < count; i++) 303 { 304 memSizes[i] = start + i * increment; 305 switch (kind) 306 { 307 case DEVICE_TO_HOST: 308 bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); 309 break; 310 case HOST_TO_DEVICE: 311 bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); 312 break; 313 case DEVICE_TO_DEVICE: 314 bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); 315 break; 316 } 317 } 318 } 319 320 if (printmode == CSV) 321 printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); 322 else 323 printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice)); 324 free(memSizes); 325 free(bandwidths); 326 return; 327 } 328 329 void testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) 330 { 331 int i; 332 unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) 333 + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) 334 + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) 335 + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) 336 + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) 337 + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) 338 + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); 339 unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); 340 double *bandwidths = (double *)malloc(count * sizeof(double)); 341 for (i = 0; i < count; i++) 342 bandwidths[i] = 0.0; 343 // 逐设备测试 344 for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) 345 { 346 printf("\n"); 347 cudaSetDevice(currentDevice); 348 for (unsigned int memSize = 0, i = 0; memSize <= SHMOO_MEMSIZE_MAX; i++) 349 { 350 if (memSize < SHMOO_LIMIT_20KB) 351 memSize += SHMOO_INCREMENT_1KB; 352 else if (memSize < SHMOO_LIMIT_50KB) 353 memSize += SHMOO_INCREMENT_2KB; 354 else if (memSize < SHMOO_LIMIT_100KB) 355 memSize += SHMOO_INCREMENT_10KB; 356 else if (memSize < SHMOO_LIMIT_1MB) 357 memSize += SHMOO_INCREMENT_100KB; 358 else if (memSize < SHMOO_LIMIT_16MB) 359 memSize += SHMOO_INCREMENT_1MB; 360 else if (memSize < SHMOO_LIMIT_32MB) 361 memSize += SHMOO_INCREMENT_2MB; 362 else 363 memSize += SHMOO_INCREMENT_4MB; 364 memSizes[i] = memSize; 365 switch (kind) 366 { 367 case DEVICE_TO_HOST: 368 bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); 369 break; 370 case HOST_TO_DEVICE: 371 bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); 372 break; 373 case DEVICE_TO_DEVICE: 374 bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); 375 break; 376 } 377 printf("."); 378 } 379 } 380 381 if (CSV == printmode) 382 printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); 383 else 384 printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice)); 385 free(memSizes); 386 free(bandwidths); 387 return; 388 } 389 390 void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) 391 { 392 switch (mode) 393 { 394 case QUICK_MODE: 395 testBandwidthRange(DEFAULT_SIZE, DEFAULT_SIZE, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); 396 break; 397 case RANGE_MODE: 398 testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); 399 break; 400 case SHMOO_MODE: 401 testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); 402 break; 403 default: 404 break; 405 } 406 } 407 408 bool test(const int argc, const char **argv) 409 { 410 // 处理命令行参数 411 // 帮助模式,计时器,输出方式 412 if (checkCmdLineFlag(argc, argv, "help")) 413 { 414 printHelp(); 415 return 0; 416 } 417 if (checkCmdLineFlag(argc, argv, "cputiming")) 418 bDontUseGPUTiming = true; 419 printMode printmode = USER_READABLE; 420 if (checkCmdLineFlag(argc, argv, "csv")) 421 printmode = CSV; 422 423 // 内存模式,默认使用页锁定内存 424 memoryMode memMode = PINNED; 425 char *memModeStr; 426 if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) 427 { 428 if (strcmp(memModeStr, "pageable") == 0) 429 memMode = PAGEABLE; 430 else if (strcmp(memModeStr, "pinned") == 0) 431 memMode = PINNED; 432 else 433 { 434 printf("\nInvalid memory mode - valid modes are pageable or pinned\n"); 435 printf("\nSee --help for more information\n"); 436 return false; 437 } 438 } 439 else 440 memMode = PINNED; 441 // 计算能力 2.2 以上,可选 cudaHostAllocWriteCombined 模式 442 bool wc = (CUDART_VERSION >= 2020 && checkCmdLineFlag(argc, argv, "wc")) ? true : false; 443 444 // 设备 445 int startDevice = 0, endDevice = 0; 446 char *device; 447 if (getCmdLineArgumentString(argc, argv, "device", &device)) 448 { 449 int deviceCount; 450 cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 451 if (error_id != cudaSuccess) 452 { 453 printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); 454 exit(EXIT_FAILURE); 455 } 456 if (deviceCount == 0) 457 { 458 printf("\nNo devices found\n"); 459 return false; 460 } 461 if (strcmp(device, "all") == 0) 462 { 463 printf("\nCumulative Bandwidth to be computed from all the devices\n"); 464 startDevice = 0; 465 endDevice = deviceCount - 1; 466 } 467 else 468 { 469 startDevice = endDevice = atoi(device); 470 if (startDevice > deviceCount || startDevice < 0) 471 { 472 printf("\nInvalid GPU number %d given hence default gpu 0 will be used\n", startDevice); 473 startDevice = endDevice = 0; 474 } 475 } 476 } 477 for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++)// 初始化设备 478 { 479 cudaDeviceProp deviceProp; 480 cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); 481 if (error_id == cudaSuccess) 482 { 483 printf(" Device %d: %s\n", currentDevice, deviceProp.name); 484 if (deviceProp.computeMode == cudaComputeModeProhibited) 485 { 486 fprintf(stderr, "\nError: device is running in <Compute Mode Prohibited>\n"); 487 cudaSetDevice(currentDevice); 488 exit(EXIT_FAILURE); 489 } 490 } 491 else 492 { 493 printf("\ncudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); 494 cudaSetDevice(currentDevice); 495 exit(EXIT_FAILURE); 496 } 497 } 498 499 // 测试内容 500 bool htod = checkCmdLineFlag(argc, argv, "htod") ? true : false; 501 bool dtoh = checkCmdLineFlag(argc, argv, "dtoh") ? true : false; 502 bool dtod = checkCmdLineFlag(argc, argv, "dtod") ? true : false; 503 // 测试模式,默认 quick 模式 504 testMode mode; 505 char *modeStr; 506 if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) 507 { 508 if (strcmp(modeStr, "quick") == 0) 509 mode = QUICK_MODE; 510 else if (strcmp(modeStr, "shmoo") == 0) 511 mode = SHMOO_MODE; 512 else if (strcmp(modeStr, "range") == 0) 513 mode = RANGE_MODE; 514 else 515 { 516 printf("\nInvalid mode - valid modes are quick, range, or shmoo\n"); 517 return false; 518 } 519 } 520 else 521 mode = QUICK_MODE; 522 int startSize, endSize, increment; 523 if (mode == RANGE_MODE)// range 模式需要给出最小和最大尺寸 524 { 525 if (checkCmdLineFlag(argc, (const char **)argv, "startSize")) 526 { 527 if ((startSize = getCmdLineArgumentInt(argc, argv, "startSize")) <= 0) 528 { 529 printf("\nIllegal argument - startSize must be greater than zero\n"); 530 return false; 531 } 532 } 533 else 534 { 535 printf("\nMust specify a starting size in range mode\n"); 536 return false; 537 } 538 if (checkCmdLineFlag(argc, (const char **)argv, "endSize")) 539 { 540 if ((endSize = getCmdLineArgumentInt(argc, argv, "endSize")) <= 0) 541 { 542 printf("\nIllegal argument - endSize must be greater than zero\n"); 543 return false; 544 } 545 if (startSize > endSize) 546 { 547 printf("\nIllegal argument - startSize is greater than endSize\n"); 548 return false; 549 } 550 } 551 else 552 { 553 printf("\nMust specify an endSize size in range mode.\n"); 554 return false; 555 } 556 if (checkCmdLineFlag(argc, argv, "increment")) 557 { 558 if ((increment = getCmdLineArgumentInt(argc, argv, "increment")) <= 0) 559 { 560 printf("\nIllegal argument - increment must be greater than zero\n"); 561 return false; 562 } 563 } 564 else 565 { 566 printf("\nMust specify an increment in user mode\n"); 567 return false; 568 } 569 } 570 else 571 startSize = endSize = increment = DEFAULT_SIZE; 572 573 #ifdef DEBUG 574 htod = dtoh = dtod = true; 575 mode = SHMOO_MODE; 576 wc = true; 577 #endif 578 // 运行测试 579 printf(" %s, %s cudaHostAllocWriteCombined", (mode == QUICK_MODE) ? "QUICK_MODE" : ((mode == RANGE_MODE) ? "RANGE_MODE" : "SHMOO_MODE"), wc ? "enable" : "disable"); 580 if (htod) 581 testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc); 582 if (dtoh) 583 testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc); 584 if (dtod) 585 testBandwidth((unsigned int)startSize, (unsigned int)endSize, (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc); 586 for (int nDevice = startDevice; nDevice <= endDevice; nDevice++)// 多设备情况下需要逐一设备同步 587 cudaSetDevice(nDevice); 588 return true; 589 } 590 591 int main(int argc, char **argv) 592 { 593 printf("Start.\n"); 594 printf("Finish: %s.\n", test(argc, (const char **)argv)? "Result = PASS" : "Result = Fail"); 595 getchar(); 596 return 0; 597 }
▶ 输出结果:QUICK_MODE,是否写入合并都尝试了。
Start. Device 0: GeForce GTX 1070 QUICK_MODE, enable cudaHostAllocWriteCombined Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12014.1 Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12780.3 Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 185140.2 Finish: Result = PASS.
Start. Device 0: GeForce GTX 1070 QUICK_MODE, disable cudaHostAllocWriteCombined Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12369.4 Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12741.7 Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 185244.1 Finish: Result = PASS.
Start. Device 0: GeForce GTX 1070 QUICK_MODE, disenable cudaHostAllocWriteCombined // 使用老版本的函数 cudaMallocHost() Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12205.4 Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 12814.8 Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 33554432 187538.8 Finish: Result = PASS.
▶ 输出结果:SHMOO_MODE(RANGE_MODE 模式需要给 .exe 额外参数,这里没有测试)
Start. Device 0: GeForce GTX 1070 SHMOO_MODE, enable cudaHostAllocWriteCombined ................................................................................. Host to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 1024 728.8 2048 1319.7 3072 1797.7 4096 2441.4 5120 2921.7 6144 3386.9 7168 3776.8 8192 4027.1 9216 4125.7 10240 4672.5 11264 5063.3 12288 5302.6 13312 5568.9 14336 5814.7 15360 6100.3 16384 6056.2 17408 6459.0 18432 6734.9 19456 6510.4 20480 6950.6 22528 7209.5 24576 7585.0 26624 7812.5 28672 7948.8 30720 8048.6 32768 8137.4 34816 8599.8 36864 8616.7 38912 8899.1 40960 9084.3 43008 9152.1 45056 9276.1 47104 9319.9 49152 9356.9 51200 9536.7 61440 10097.5 71680 10357.5 81920 10209.5 92160 10612.0 102400 10923.5 204800 11632.7 307200 12065.2 409600 12157.0 512000 12240.7 614400 12227.5 716800 12314.8 819200 12379.2 921600 12349.4 1024000 12126.7 1126400 12405.8 2174976 12254.6 3223552 12200.2 4272128 12343.5 5320704 12267.2 6369280 12405.5 7417856 12360.6 8466432 12376.7 9515008 12435.5 10563584 12441.0 11612160 12390.6 12660736 12354.7 13709312 12560.6 14757888 12540.4 15806464 12414.0 16855040 12387.1 18952192 12436.0 21049344 12455.3 23146496 12458.1 25243648 12438.6 27340800 12594.3 29437952 12459.4 31535104 12468.8 33632256 12464.1 37826560 12477.6 42020864 12609.3 46215168 12509.2 50409472 12511.1 54603776 12510.9 58798080 12544.5 62992384 12536.1 67186688 12589.3 ................................................................................. Device to Host Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 1024 1190.9 2048 2164.1 3072 2989.5 4096 3792.5 5120 4398.9 6144 4882.8 7168 5340.6 8192 5908.8 9216 6189.5 10240 6341.3 11264 6975.4 12288 7233.8 13312 7381.0 14336 7724.2 15360 7744.1 16384 8132.7 17408 8178.1 18432 8410.6 19456 8433.9 20480 8719.3 22528 9027.0 24576 9155.3 26624 9474.1 28672 9205.7 30720 9390.0 32768 9701.2 34816 9911.4 36864 10044.6 38912 10271.6 40960 10444.5 43008 10456.5 45056 10480.2 47104 10645.0 49152 10698.2 51200 10847.7 61440 11068.5 71680 11336.5 81920 11522.3 92160 11564.6 102400 11765.8 204800 12276.1 307200 12477.4 409600 12548.2 512000 12626.9 614400 12685.4 716800 12680.3 819200 12572.4 921600 12735.9 1024000 12746.6 1126400 12612.6 2174976 12590.1 3223552 12686.1 4272128 12613.7 5320704 12698.9 6369280 12744.6 7417856 12671.2 8466432 12725.5 9515008 12656.7 10563584 12748.5 11612160 12679.3 12660736 12633.1 13709312 12699.6 14757888 12742.6 15806464 12743.1 16855040 12778.5 18952192 12853.2 21049344 12720.3 23146496 12687.7 25243648 12723.4 27340800 12727.3 29437952 12735.1 31535104 12781.9 33632256 12728.6 37826560 12698.2 42020864 12741.9 46215168 12740.8 50409472 12727.9 54603776 12732.4 58798080 12732.0 62992384 12746.5 67186688 12721.2 ................................................................................. Device to Device Bandwidth, 1 Device(s), PINNED Memory Transfers Transfer Size: Bytes Bandwidth: MB/s 1024 996.5 2048 1606.9 3072 2381.9 4096 3112.5 5120 3875.2 6144 4455.8 7168 5314.0 8192 5986.6 9216 6709.2 10240 7414.0 11264 8016.6 12288 8647.5 13312 9300.6 14336 9943.2 15360 9765.6 16384 13329.8 17408 13697.3 18432 6595.5 19456 16327.5 20480 16054.5 22528 17978.6 24576 19574.6 26624 20897.6 28672 22786.5 30720 24414.1 32768 25933.6 34816 27669.3 36864 29377.2 38912 30542.7 40960 33235.8 43008 35511.4 45056 35319.8 47104 37231.3 49152 39635.4 51200 40521.3 61440 39324.7 71680 69499.6 81920 70531.8 92160 77779.3 102400 85277.7 204800 162211.3 307200 204873.3 409600 258718.8 512000 274195.0 614400 309201.8 716800 261475.0 819200 362004.1 921600 391685.8 1024000 362360.9 1126400 155871.0 2174976 174356.8 3223552 179414.9 4272128 182220.3 5320704 170005.7 6369280 179022.1 7417856 177974.8 8466432 177474.9 9515008 182872.5 10563584 187523.3 11612160 184357.2 12660736 187779.5 13709312 188050.6 14757888 188358.7 15806464 188660.6 16855040 189137.7 18952192 189415.0 21049344 182262.0 23146496 183092.1 25243648 184434.2 27340800 183828.0 29437952 185503.0 31535104 184717.2 33632256 186353.3 37826560 185746.6 42020864 184612.9 46215168 185253.0 50409472 185704.4 54603776 186218.8 58798080 190486.9 62992384 190930.6 67186688 185773.8 Finish: Result = PASS.
▶ 涨姿势
● 申请全局内存时使用新版本的函数 cudaHostAlloc() 与老版本的函数 cudaMallocHost() 性能没有明显差距。
● 申请全局内存时是否指明合并写入标志,性能没有明显差距。
● 对不同大小的内存随便进行拷贝,可以发现带宽随碎片大小的增大而增大,并逐渐趋于稳定。设备之间的内存拷贝比设备与主机之间的拷贝速度高一个量级。
● 从主机内存向设备进行内存拷贝的时候使用了占位内存 h_cacheClear1 和 h_cacheClear2(共 32M 大小),这是为了消除主机内存缓存对内存拷贝的加速作用。