Compare the efficiency of data copies alone, and of data copies combined with kernel launches, between a single stream and multiple streams (four).
▶ Source code
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

// Assumes 64-bit Windows with 64-bit target code; support for other platforms has been removed
#define MEMORY_ALIGNMENT 4096 // align memory to 4 KB
#define ALIGN_UP(x,size) (((size_t)x + (size - 1)) & (~(size - 1))) // round x up to a multiple of size

__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < num_iterations; i++)
        g_data[idx] += *factor;
}

bool check(int *a, const int nArray, const int c)
{
    for (int i = 0; i < nArray; i++)
    {
        if (a[i] != c)
        {
            printf("\nArray\tError at i = %d, %d, %d\n", i, a[i], c);
            return false;
        }
    }
    return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nByte)
{
    if (bPinGenericMemory) // allocate generic page-aligned memory, then pin it
    {
        printf("\nVirtualAlloc(), %4.2f MB (generic page-aligned system memory)\n", (float)nByte / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nByte + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
        cudaHostRegister(*ppAligned_a, nByte, cudaHostRegisterMapped); // page-lock the memory; required for async copies
    }
    else
    {
        printf("\ncudaMallocHost(), %4.2f MB\n", (float)nByte / 1048576.0f);
        cudaMallocHost((void **)pp_a, nByte); // already page-locked at allocation time
        *ppAligned_a = *pp_a;
    }
}

int main() // use default parameters instead of reading them from the command line
{
    printf("\n\tStart\n");
    int nreps = 100;     // number of timing repetitions
    int niterations = 5; // number of iterations inside the kernel
    int nstreams = 4;    // number of streams to use
    float elapsed_time;
    bool bPinGenericMemory;

    cudaSetDevice(0); // device-selection logic removed
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    if (deviceProp.canMapHostMemory) // generic pinned memory requires support for mapping host memory
        bPinGenericMemory = true;
    else
    {
        printf("\nDevice does not support mapping of generic host memory, using cudaMallocHost() instead\n");
        bPinGenericMemory = false;
    }

    // Reduce the test workload when the GPU has fewer than 32 CUDA cores in total
    // (the original sample did not shrink nByte accordingly; fixed here)
    float scale_factor = max(32.0f / float(_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount), 1.0f);
    int nArray = (int)rint((float)16 * 1024 * 1024 / scale_factor); // number of elements in the test array
    int nByte = nArray * sizeof(int);                               // size of the test array in bytes
    printf("\nWorkload *= %1.4f, array_size = %d\n", 1.0f / scale_factor, nArray);

    // use blocking synchronization to reduce CPU usage while waiting
    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | (bPinGenericMemory ? cudaDeviceMapHost : 0));

    int *h_a = 0, *hAligned_a = 0;
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nByte); // allocate memory the chosen way
    int c = 5, *d_a = 0, *d_c = 0;
    cudaMalloc((void **)&d_a, nByte);
    cudaMemset(d_a, 0x0, nByte);
    cudaMalloc((void **)&d_c, sizeof(int));
    cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice);
    cudaEvent_t start_event, stop_event;
    cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
    for (int i = 0; i < nstreams; i++)
        cudaStreamCreate(&(streams[i]));

    printf("\n\tStart test\n");
    // asynchronous copy test
    cudaEventRecord(start_event, 0);
    cudaMemcpyAsync(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost, streams[0]);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("memcopy:\t%.2f\n", elapsed_time);

    // kernel test
    dim3 threads = dim3(512);
    dim3 blocks = dim3(nArray / threads.x);
    cudaEventRecord(start_event, 0);
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("kernel:\t\t%.2f\n", elapsed_time);

    // serial (non-streamed) test
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++)
    {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        cudaMemcpy(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

    // multi-stream test
    blocks = dim3(nArray / (nstreams * threads.x), 1);
    memset(hAligned_a, 255, nByte);
    cudaMemset(d_a, 0, nByte);
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++) // distribute the kernel launches and copy-back work across the streams
    {
        for (int i = 0; i < nstreams; i++)
            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * nArray / nstreams, d_c, niterations);
        for (int i = 0; i < nstreams; i++)
            cudaMemcpyAsync(hAligned_a + i * nArray / nstreams, d_a + i * nArray / nstreams, nByte / nstreams, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

    // check the result and clean up
    printf("\n\tResult: %s\n", check(hAligned_a, nArray, c * nreps * niterations) ? "Passed" : "Failed");
    cudaFree(d_a);
    cudaFree(d_c);
    if (bPinGenericMemory)
    {
        cudaHostUnregister(hAligned_a);
        VirtualFree(h_a, 0, MEM_RELEASE);
    }
    else
        cudaFreeHost(h_a);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i = 0; i < nstreams; i++)
        cudaStreamDestroy(streams[i]);
    free(streams); // the original sample leaked the stream array

    getchar();
    return 0;
}
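The listing above omits error checking for brevity. Since helper_cuda.h from the CUDA samples is already included, each runtime call can be wrapped in its checkCudaErrors() macro; a minimal sketch, reusing the sample's variables:

checkCudaErrors(cudaMalloc((void **)&d_a, nByte));
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost, streams[0]));
// kernel launches do not return a cudaError_t; query the launch status afterwards
init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
checkCudaErrors(cudaGetLastError());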
▶ Output
	Start

Workload *= 1.0000, array_size = 16777216

VirtualAlloc(), 64.00 MB (generic page-aligned system memory)

	Start test
memcopy:	5.34
kernel:		5.15
non-streamed:	9.95
4 streams:	5.24

	Result: Passed

With four streams the kernels and the device-to-host copies overlap, so the per-repetition time drops from roughly memcopy + kernel (9.95 ms) to about the larger of the two (5.24 ms).
▶ Takeaways
● Macros and function prototypes involved
// driver_types.h
#define cudaStreamPerThread ((cudaStream_t)0x2)

#define cudaEventDefault       0x00 // Default event flag
#define cudaEventBlockingSync  0x01 // Event uses blocking synchronization
#define cudaEventDisableTiming 0x02 // Event will not record timing data
#define cudaEventInterprocess  0x04 // Event is suitable for interprocess use. cudaEventDisableTiming must be set

#define cudaDeviceScheduleAuto         0x00 // Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin         0x01 // Device flag - Spin default scheduling
#define cudaDeviceScheduleYield        0x02 // Device flag - Yield default scheduling
#define cudaDeviceScheduleBlockingSync 0x04 // Device flag - Use blocking synchronization
#define cudaDeviceBlockingSync         0x04 // Device flag - Use blocking synchronization; deprecated as of CUDA 4.0, replaced with ::cudaDeviceScheduleBlockingSync
#define cudaDeviceScheduleMask         0x07 // Device schedule flags mask
#define cudaDeviceMapHost              0x08 // Device flag - Support mapped pinned allocations
#define cudaDeviceLmemResizeToMax      0x10 // Device flag - Keep local memory allocation after launch
#define cudaDeviceMask                 0x1f // Device flags mask

#define cudaArrayDefault          0x00 // Default CUDA array allocation flag
#define cudaArrayLayered          0x01 // Must be set in cudaMalloc3DArray to create a layered CUDA array
#define cudaArraySurfaceLoadStore 0x02 // Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array
#define cudaArrayCubemap          0x04 // Must be set in cudaMalloc3DArray to create a cubemap CUDA array
#define cudaArrayTextureGather    0x08 // Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array

#define cudaIpcMemLazyEnablePeerAccess 0x01 // Automatically enable peer access between remote devices as needed

#define cudaMemAttachGlobal 0x01 // Memory can be accessed by any stream on any device
#define cudaMemAttachHost   0x02 // Memory cannot be accessed by any stream on any device
#define cudaMemAttachSingle 0x04 // Memory can only be accessed by a single stream on the associated device

#define cudaOccupancyDefault                0x00 // Default behavior
#define cudaOccupancyDisableCachingOverride 0x01 // Assume global caching is enabled and cannot be automatically turned off

#define cudaCpuDeviceId     ((int)-1) // Device id that represents the CPU
#define cudaInvalidDeviceId ((int)-2) // Device id that represents an invalid device

// cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags);

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);

// memoryapi.h
WINBASEAPI _Ret_maybenull_ _Post_writable_byte_size_(dwSize) LPVOID WINAPI VirtualAlloc
(
    _In_opt_ LPVOID lpAddress, _In_ SIZE_T dwSize, _In_ DWORD flAllocationType, _In_ DWORD flProtect
);

WINBASEAPI BOOL WINAPI VirtualFree
(
    _Pre_notnull_ _When_(dwFreeType == MEM_DECOMMIT, _Post_invalid_) _When_(dwFreeType == MEM_RELEASE, _Post_ptr_invalid_) LPVOID lpAddress,
    _In_ SIZE_T dwSize,
    _In_ DWORD dwFreeType
);

// winnt.h
#define PAGE_READWRITE 0x04
#define MEM_COMMIT     0x1000
#define MEM_RESERVE    0x2000
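Since the sample sets cudaDeviceMapHost and registers the host buffer with cudaHostRegisterMapped, the pinned region can also be mapped into the device address space (zero-copy). A minimal sketch, assuming pAlign is a buffer registered as in the sample:

int *dPtr = NULL;
// valid only after cudaSetDeviceFlags(cudaDeviceMapHost) and cudaHostRegister(..., cudaHostRegisterMapped)
cudaHostGetDevicePointer((void **)&dPtr, (void *)pAlign, 0);
// kernels may now read and write the pinned host buffer directly through dPtr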
● Steps for using generic page-aligned pinned memory
#define CEIL(x,y) (((x) - 1) / (y) + 1)

int sizeByte = sizeof(int) * 16 * 1024 * 1024;
int align = 4096;
int *p, *pAlign;
p = (int *)VirtualAlloc(NULL, (sizeByte + align), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
pAlign = (int *)(CEIL((size_t)p, align) * align); // round the pointer up to the next 4 KB boundary
cudaHostRegister(pAlign, sizeByte, cudaHostRegisterMapped);

...

cudaHostUnregister(pAlign);
VirtualFree(p, 0, MEM_RELEASE);
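VirtualAlloc()/VirtualFree() are Windows-only. On Linux the same steps would use a POSIX aligned allocation instead; a minimal sketch, not part of the original sample:

#include <stdlib.h>
#include <cuda_runtime.h>

int sizeByte = sizeof(int) * 16 * 1024 * 1024;
int *p = NULL;
posix_memalign((void **)&p, 4096, sizeByte);           // returns page-aligned ordinary memory
cudaHostRegister(p, sizeByte, cudaHostRegisterMapped); // page-lock it for async copies

...

cudaHostUnregister(p);
free(p);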
● Timing with cudaEventCreateWithFlags() and the related calls, which differ slightly from the earlier cudaEventCreate().
float elapsed_time = 0.0f;
cudaEvent_t start_event, stop_event;
cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
cudaEventRecord(start_event, 0);

...

cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);

cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
With the cudaEventBlockingSync flag, cudaEventCreateWithFlags() creates an event on which cudaEventSynchronize() blocks the calling host thread instead of busy-spinning, reducing CPU usage while waiting.
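For reference, the older call is just the default-flag form; a minimal sketch of the two creation styles:

cudaEvent_t e1, e2;
cudaEventCreate(&e1);                                 // same as cudaEventCreateWithFlags(&e1, cudaEventDefault)
cudaEventCreateWithFlags(&e2, cudaEventBlockingSync); // cudaEventSynchronize(e2) blocks the host thread instead of busy-waiting
cudaEventDestroy(e1);
cudaEventDestroy(e2);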