- 近日,CUDA 4.0已经对注册开发者开放,其中增加了不少的功能。其中P2P(Peer-to-Peer)与UVA(Unified Virtual Address Space)的引进最为大家关心。这里与大家一起分享下SDK中的simpleP2P这个例子,它展示了如何使用这两个功能。
-
- 代码如下:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <cutil_inline.h>
- #include <cuda_runtime_api.h>
// Name of this SDK sample, printed in all status/result messages.
const char *sSDKsample = "simpleP2P";
// Doubles each element: dst[i] = 2 * src[i].
// NOTE: there is no bounds guard, so the launch configuration must cover the
// buffer exactly (gridDim.x * blockDim.x == element count), as main() arranges.
// With P2P enabled, src and dst may live on different GPUs.
__global__ void SimpleKernel(float *src, float *dst)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    dst[i] = 2.0f * src[i];
}
// simpleP2P driver: checks for two P2P-capable GPUs, benchmarks peer-to-peer
// memcpy bandwidth, then runs a kernel on each GPU that reads the other GPU's
// memory (directly under UVA, via peer-registered pointers otherwise) and
// verifies the result on the host.
//
// Fixes vs. the original listing:
//  - all printf format strings used "/n" (literal) instead of "\n"
//  - "&prop_0"/"&prop_1" had been garbled into "∝_0"/"∝_1" (would not compile)
//  - the copy-back to host used cudaMemcpyHostToDevice (wrong direction)
//  - the teardown message said "Enabling" while disabling peer access
int main(int argc, char **argv)
{
    printf("[%s] starting...\n", sSDKsample);

    // P2P / UVA require at least two CUDA devices.
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    cutilSafeCall(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for %s.\n", sSDKsample);
        printf("Waiving test.\n");
        printf("PASSED\n");
        exit(EXIT_SUCCESS);
    }

    cudaDeviceProp prop_0, prop_1;
    cutilSafeCall(cudaGetDeviceProperties(&prop_0, 0));
    cutilSafeCall(cudaGetDeviceProperties(&prop_1, 1));

#ifdef _WIN32
    // On Windows both GPUs must run the TCC driver for P2P / UVA to work.
    if (prop_0.tccDriver == 0 || prop_1.tccDriver == 0)
    {
        printf("Need to have both GPUs running under TCC driver to use P2P / UVA functionality.\n");
        printf("PASSED\n");
        exit(EXIT_SUCCESS);
    }
#endif // _WIN32

    // Peer access must be possible in both directions for this test.
    printf("Checking for peer access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_0_1, 0, 1));
    cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_1_0, 1, 0));
    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for %s.\n", sSDKsample);
        printf("Peer access is not available between GPU0 <-> GPU1, waiving test.\n");
        printf("PASSED\n");
        exit(EXIT_SUCCESS);
    }

    // Enable peer access in both directions (flags argument must be 0).
    printf("Enabling peer access...\n");
    cutilSafeCall(cudaSetDevice(0));
    cutilSafeCall(cudaDeviceEnablePeerAccess(1, 0));
    cutilSafeCall(cudaSetDevice(1));
    cutilSafeCall(cudaDeviceEnablePeerAccess(0, 0));

    // With UVA, a single address space spans the host and both devices, so
    // plain cudaMemcpy with cudaMemcpyDefault and direct peer pointers work.
    printf("Checking for UVA...\n");
    const bool has_uva = prop_0.unifiedAddressing && prop_1.unifiedAddressing;
    if (has_uva == false)
    {
        printf("At least one of the two GPUs has no UVA support\n");
    }

    // One 64 MB float buffer on each GPU plus a pinned host staging buffer.
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU0, GPU1 and Host)...\n", int(buf_size / 1024 / 1024));
    cutilSafeCall(cudaSetDevice(0));
    float* g0;
    cutilSafeCall(cudaMalloc(&g0, buf_size));
    cutilSafeCall(cudaSetDevice(1));
    float* g1;
    cutilSafeCall(cudaMalloc(&g1, buf_size));
    float* h0;
    if (has_uva)
        cutilSafeCall(cudaMallocHost(&h0, buf_size));
    else
        // Portable pinned memory so both device contexts can use the buffer.
        cutilSafeCall(cudaHostAlloc(&h0, buf_size, cudaHostAllocPortable));

    // Without UVA each device needs an explicit peer mapping of the other
    // device's allocation to obtain a pointer it can dereference in kernels.
    float *g0_peer, *g1_peer;
    if (has_uva == false)
    {
        cutilSafeCall(cudaSetDevice(0));
        cutilSafeCall(cudaPeerRegister(g1, 1, cudaPeerRegisterMapped));
        cutilSafeCall(cudaPeerGetDevicePointer((void **) &g1_peer, g1, 1, 0));
        cutilSafeCall(cudaSetDevice(1));
        cutilSafeCall(cudaPeerRegister(g0, 0, cudaPeerRegisterMapped));
        cutilSafeCall(cudaPeerGetDevicePointer((void **) &g0_peer, g0, 0, 0));
    }

    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    cutilSafeCall(cudaEventCreateWithFlags(&start_event, eventflags));
    cutilSafeCall(cudaEventCreateWithFlags(&stop_event, eventflags));

    // Benchmark: 100 ping-pong copies between the two GPUs.
    cutilSafeCall(cudaEventRecord(start_event, 0));
    for (int i = 0; i < 100; i++)
    {
        if (has_uva)
        {
            // UVA: the runtime infers the copy direction from the pointers.
            if (i % 2 == 0)
                cutilSafeCall(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
            else
                cutilSafeCall(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
        else
        {
            // No UVA: device ids must be passed explicitly.
            if (i % 2 == 0)
                cutilSafeCall(cudaMemcpyPeer(g1, 1, g0, 0, buf_size));
            else
                cutilSafeCall(cudaMemcpyPeer(g0, 0, g1, 1, buf_size));
        }
    }
    cutilSafeCall(cudaEventRecord(stop_event, 0));
    cutilSafeCall(cudaEventSynchronize(stop_event));
    cutilSafeCall(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: %.2fGB/s\n",
        (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

    // Fill the host buffer with a known pattern and stage it on GPU0.
    printf("Preparing host buffer and memcpy to GPU0...\n");
    for (int i = 0; i < buf_size / sizeof(float); i++)
    {
        h0[i] = float(i % 4096);
    }
    cutilSafeCall(cudaSetDevice(0));
    if (has_uva)
        cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));
    else
        cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyHostToDevice));

    // Grid exactly covers the buffer; SimpleKernel has no bounds guard.
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // GPU1 reads GPU0's memory (peer access) and doubles it into g1.
    printf("Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...\n");
    cutilSafeCall(cudaSetDevice(1));
    if (has_uva)
        SimpleKernel<<<blocks, threads>>> (g0, g1);
    else
        SimpleKernel<<<blocks, threads>>> (g0_peer, g1);

    // GPU0 reads GPU1's memory and doubles it back into g0 (net factor 4).
    printf("Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...\n");
    cutilSafeCall(cudaSetDevice(0));
    if (has_uva)
        SimpleKernel<<<blocks, threads>>> (g1, g0);
    else
        SimpleKernel<<<blocks, threads>>> (g1_peer, g0);

    // Copy results back to the host and check pattern * 2 * 2.
    printf("Copy data back to host from GPU0 and verify...\n");
    if (has_uva)
        cutilSafeCall(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));
    else
        // BUG FIX: original used cudaMemcpyHostToDevice here; the copy is
        // device -> host, so the direction must be cudaMemcpyDeviceToHost.
        cutilSafeCall(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDeviceToHost));
    int error_count = 0;
    for (int i = 0; i < buf_size / sizeof(float); i++)
    {
        // Each element was doubled once per kernel, so expect 4x the pattern.
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
        {
            printf("Verification error, element %i\n", i);
            if (error_count++ > 10)
                break;
        }
    }
    printf((error_count == 0) ? "PASSED\n" : "FAILED\n");

    // BUG FIX: original message said "Enabling" while tearing access down.
    printf("Disabling peer access...\n");
    cutilSafeCall(cudaSetDevice(0));
    cutilSafeCall(cudaDeviceDisablePeerAccess(1));
    cutilSafeCall(cudaSetDevice(1));
    cutilSafeCall(cudaDeviceDisablePeerAccess(0));

    printf("Shutting down...\n");
    cutilSafeCall(cudaEventDestroy(start_event));
    cutilSafeCall(cudaEventDestroy(stop_event));
    cutilSafeCall(cudaSetDevice(0));
    cutilSafeCall(cudaFree(g0));
    cutilSafeCall(cudaSetDevice(1));
    cutilSafeCall(cudaFree(g1));
    cutilSafeCall(cudaFreeHost(h0));
    cudaDeviceReset();
    cutilExit(argc, argv);
}
从这段代码可以看出,目前仅有Fermi架构的Tesla卡才能支持P2P功能。由于UVA的需要,想成功编译运行程序,需要编译成64位程序。
而且如果支持UVA(其实现在如果是支持P2P的卡理论上应该都是支持UVA的),可以使用cudaMemcpyDefault代替原有的cudaMemcpyHostToDevice等方式,
而且在内核函数等的调用上,不再需要分别获取各设备内存的单独地址,大大地缩减了代码的编写量。