Using the P2P (peer-to-peer) feature to transfer, read, and write data between GPUs.
▶ Source code. It covers the checks required before using P2P, device-to-device copies, and data transfer and mutual access between host and devices.
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define MAX_GPU_COUNT 64

__global__ void SimpleKernel(float *src, float *dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    return (bool)(pProp->tccDriver ? true : false);
#else
    return (bool)(pProp->major >= 2);
#endif
}

int main(int argc, char **argv)
{
    printf("\n\t%s Starting\n", argv[0]);

    // Check for a 64-bit operating system environment
    if (sizeof(void*) != 8)
    {
        printf("\n\tError: this program is only supported on a 64-bit OS with a 64-bit target\n");
        return EXIT_WAIVED;
    }

    // Find the first two devices with compute capability >= 2.0
    int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("\n\tDevice count: %d\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("\n\tError: two or more GPUs with SM 2.0 are required\n");
        return EXIT_WAIVED;
    }

    cudaDeviceProp prop[MAX_GPU_COUNT];
    int gpuid[MAX_GPU_COUNT], gpu_count = 0;
    printf("\n\tShow device\n");    // List all devices
    for (int i = 0; i < gpu_n; i++)
    {
        cudaGetDeviceProperties(&prop[i], i);
        if ((prop[i].major >= 2)
#ifdef _WIN32
            && prop[i].tccDriver    // On Windows the Tesla Compute Cluster driver is also required
#endif
           )
            gpuid[gpu_count++] = i;
        printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
    }
    if (gpu_count < 2)
    {
        printf("\n\tError: two or more GPUs with SM 2.0 are required\n");
#ifdef _WIN32
        printf("\nOr the TCC driver is required\n");
#endif
        cudaSetDevice(0);
        return EXIT_WAIVED;
    }

    // Find a pair of devices to test with
    int can_access_peer, p2pCapableGPUs[2];
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
    printf("\n\tShow combinations of devices with P2P\n");  // List all P2P-capable device pairs
    for (int i = 0; i < gpu_count - 1; i++)
    {
        for (int j = i + 1; j < gpu_count; j++)
        {
            cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
            if (can_access_peer)
            {
                printf("\n\tGPU%d (%s) <--> GPU%d (%s)\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
                if (p2pCapableGPUs[0] == -1)
                    p2pCapableGPUs[0] = gpuid[i], p2pCapableGPUs[1] = gpuid[j];
            }
        }
    }
    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
    {
        printf("\n\tError: P2P is not available among the GPUs\n");
        for (int i = 0; i < gpu_count; i++)
        {
            cudaSetDevice(gpuid[i]);
            cudaDeviceReset();      // release each device's state before bailing out
        }
        return EXIT_WAIVED;
    }

    // Run the test on the pair we found
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];
    printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);

    // Enable P2P
    cudaSetDevice(gpuid[0]);
    cudaDeviceEnablePeerAccess(gpuid[1], 0);
    cudaSetDevice(gpuid[1]);
    cudaDeviceEnablePeerAccess(gpuid[0], 0);

    // Check that both devices support Unified Virtual Addressing (UVA)
    if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
    {
        printf("\n\tError: GPU does not support UVA\n");
        return EXIT_WAIVED;
    }

    // Allocate memory
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
    cudaSetDevice(gpuid[0]);
    float *g0;
    cudaMalloc(&g0, buf_size);
    cudaSetDevice(gpuid[1]);
    float *g1;
    cudaMalloc(&g1, buf_size);
    float *h0;
    cudaMallocHost(&h0, buf_size);

    cudaEvent_t start_event, stop_event;
    int eventflags = cudaEventBlockingSync;
    float time_memcpy;

    cudaEventCreateWithFlags(&start_event, eventflags);
    cudaEventCreateWithFlags(&stop_event, eventflags);
    cudaEventRecord(start_event, 0);

    for (int i = 0; i < 100; i++)
    {
        // Copy back and forth between the two GPUs.
        // With UVA, cudaMemcpyDefault infers the copy direction from the
        // pointers themselves (host or device).
        if (i % 2 == 0)
            cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
        else
            cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
    printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));

    for (int i = 0; i < int(buf_size / sizeof(float)); i++)
        h0[i] = float(i % 4096);
    cudaSetDevice(gpuid[0]);
    cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run on GPU1: read GPU0's global memory, compute, write to GPU1's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
    cudaSetDevice(gpuid[1]);
    SimpleKernel<<<blocks, threads>>>(g0, g1);
    cudaDeviceSynchronize();

    // Run on GPU0: read GPU1's global memory, compute, write to GPU0's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
    cudaSetDevice(gpuid[0]);
    SimpleKernel<<<blocks, threads>>>(g1, g0);
    cudaDeviceSynchronize();

    // Verify the results
    cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
    int error_count = 0;
    for (int i = 0; i < int(buf_size / sizeof(float)); i++)
    {
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
        {
            printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));
            if (error_count++ > 10)
                break;
        }
    }

    // Disable P2P
    cudaSetDevice(gpuid[0]);
    cudaDeviceDisablePeerAccess(gpuid[1]);
    cudaSetDevice(gpuid[1]);
    cudaDeviceDisablePeerAccess(gpuid[0]);

    // Cleanup
    cudaFreeHost(h0);
    cudaSetDevice(gpuid[0]);
    cudaFree(g0);
    cudaSetDevice(gpuid[1]);
    cudaFree(g1);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i = 0; i < gpu_n; i++)
    {
        cudaSetDevice(i);
        cudaDeviceReset();          // reset every device so all state is cleaned up
    }
    printf("\n\t%s!\n", error_count ? "Test failed" : "Test passed");

    getchar();
    return 0;
}
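For reference, a build command along the lines below should work. The include path is an assumption based on the default CUDA samples layout (helper_cuda.h and helper_functions.h live under the samples' common/inc directory), and simpleP2P.cu is a placeholder file name; adjust both to your installation.

nvcc -arch=sm_20 -I/usr/local/cuda/samples/common/inc simpleP2P.cu -o simpleP2P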
▶ Output
Only one device is available on this machine, so the test could not be run for now.
▶ Takeaways:
● P2P requirements: at least two devices with compute capability not lower than 2.0 that both support Unified Virtual Addressing (UVA); a CUDA environment of version 4.0 or later; on Windows, the Tesla Compute Cluster (TCC) driver must be installed. A runtime check sketch follows.
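A minimal sketch (mine, not from the sample) of how these requirements could be verified at runtime; the function name CheckP2PRequirements is made up, and error handling is omitted for brevity:

#include <cuda_runtime.h>

bool CheckP2PRequirements(int dev0, int dev1)
{
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);            // e.g. 4000 means CUDA 4.0
    if (runtime_version < 4000)
        return false;

    cudaDeviceProp p0, p1;
    cudaGetDeviceProperties(&p0, dev0);
    cudaGetDeviceProperties(&p1, dev1);
    if (p0.major < 2 || p1.major < 2)                   // compute capability >= 2.0
        return false;
    if (!p0.unifiedAddressing || !p1.unifiedAddressing) // both must support UVA
        return false;
#ifdef _WIN32
    if (!p0.tccDriver || !p1.tccDriver)                 // Windows: TCC driver required
        return false;
#endif
    int can01 = 0, can10 = 0;                           // check both directions
    cudaDeviceCanAccessPeer(&can01, dev0, dev1);
    cudaDeviceCanAccessPeer(&can10, dev1, dev0);
    return can01 && can10;
}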
● Key steps for using P2P (a corrected snippet below; an explicit cudaMemcpyPeer variant follows it)
// Check whether two devices can access each other via P2P
int can_access_peer;
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

// Enable P2P
cudaSetDevice(gpuid[i]);
cudaDeviceEnablePeerAccess(gpuid[j], 0);
cudaSetDevice(gpuid[j]);
cudaDeviceEnablePeerAccess(gpuid[i], 0);

// Transfer data between the devices
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Disable P2P (each device disables access to its peer, not to itself)
cudaSetDevice(gpuid[i]);
cudaDeviceDisablePeerAccess(gpuid[j]);
cudaSetDevice(gpuid[j]);
cudaDeviceDisablePeerAccess(gpuid[i]);

// cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
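As a side note, when UVA is unavailable, or when you prefer to be explicit, the runtime also offers cudaMemcpyPeer / cudaMemcpyPeerAsync, which name both devices instead of letting cudaMemcpyDefault infer the direction from the pointers. A small sketch, reusing g0, g1, buf_size, and gpuid from the sample above:

// GPU0 -> GPU1, explicit peer copy (no UVA needed)
cudaMemcpyPeer(g1, gpuid[1], g0, gpuid[0], buf_size);

// GPU1 -> GPU0, asynchronous variant on a stream
cudaStream_t stream;
cudaSetDevice(gpuid[0]);
cudaStreamCreate(&stream);
cudaMemcpyPeerAsync(g0, gpuid[0], g1, gpuid[1], buf_size, stream);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);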
● Other definitions referenced in the code
// helper_string.h
#define EXIT_WAIVED 2
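● Error checking: the sample includes helper_cuda.h but never uses its checkCudaErrors macro; wrapping each runtime call in it, as the official CUDA samples do, surfaces failures immediately, e.g.:

checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));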