爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

使用 Runtime API 和 Driver API 检测设备相关属性。并检测了设备之间的拓扑以及主机与设备之间的拓扑(是否支持跨设备原子操作)。

▶ 源代码:Runtime API

  1 #include <memory>
  2 #include <iostream>
  3 #include <cuda_runtime.h>
  4 #include <helper_cuda.h>
  5 
  6 #if CUDART_VERSION < 5000
  7 #include <cuda.h>
  8 
  9 template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)// Template wrapper around the Driver API attribute query (cuDeviceGetAttribute); used when the Runtime API (< CUDA 5.0) cannot report these attributes
 10 {
 11     CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
 12     if (CUDA_SUCCESS != error)
 13     {
 14         fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);
 15         exit(EXIT_FAILURE);
 16     }
 17 }
 18 #endif
 19 
 20 int main()
 21 {
 22     printf("Start.\n");
 23     printf("\n\tCUDA Device Query (Runtime API) version (CUDART static linking)\n");
 24 
 25     int deviceCount;
 26     cudaError_t error_id;
 27     if ((error_id = cudaGetDeviceCount(&deviceCount)) != cudaSuccess)
 28     {
 29         printf("\ncudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
 30         printf("\nResult = Fail\n");
 31         exit(EXIT_FAILURE);
 32     }
 33     printf("\nDetected %d CUDA Capable device(s)\n", deviceCount);
 34 
 35     int dev, driverVersion, runtimeVersion;// NOTE(review): driverVersion/runtimeVersion stay uninitialized if deviceCount == 0, yet are read in the summary below
 36     for (dev = 0; dev < deviceCount; ++dev)
 37     {
 38         cudaSetDevice(dev);
 39         cudaDeviceProp deviceProp;
 40         cudaGetDeviceProperties(&deviceProp, dev);
 41         printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
 42 
 43         cudaDriverGetVersion(&driverVersion);
 44         cudaRuntimeGetVersion(&runtimeVersion);
 45         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
 46             driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
 47         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
 48         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
 49             (float)deviceProp.totalGlobalMem / 1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
 50         printf("  Multiprocessors: %2d, CUDA Cores/MP: %3d        %d CUDA Cores\n",
 51             deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
 52             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
 53         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
 54 #if CUDART_VERSION >= 5000
 55         printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
 56         printf("  Memory Bus Width:                              %d-bit\n", deviceProp.memoryBusWidth);
 57         if (deviceProp.l2CacheSize)
 58             printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
 59 #else// On CUDA 4.0 - 4.2 these attributes must be queried through the Driver API
 60         int memoryClock;
 61         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 62         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 63         int memBusWidth;
 64         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 65         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 66         int L2CacheSize;
 67         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 68         if (L2CacheSize)
 69             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
 70 #endif
 71         printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
 72             deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
 73             deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
 74         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
 75             deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
 76         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
 77             deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
 78         printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
 79         printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
 80         printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
 81         printf("  Warp size:                                     %d\n", deviceProp.warpSize);
 82         printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
 83         printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
 84         printf("  Max dimension size of a thread block (x,y,z):  (%d, %d, %d)\n",
 85             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
 86         printf("  Max dimension size of a grid size    (x,y,z):  (%d, %d, %d)\n",
 87             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
 88         printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
 89         printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
 90         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
 91         printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
 92         printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
 93         printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
 94         printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
 95         printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
 96 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 97         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ?
 98             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 99 #endif
100         printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
101         printf("  Supports Cooperative Kernel Launch:            %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
102         printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
103         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
104 
105         const char *sComputeMode[] =
106         {
107             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
108             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
109             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
110             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
111             "Unknown",
112             NULL
113         };
114         printf("  Compute Mode: < %s >\n", sComputeMode[deviceProp.computeMode]);
115     }
116 
117     if (deviceCount >= 2)// Multi-device case: find the devices eligible for P2P and report pairwise peer access
118     {
119         cudaDeviceProp prop[64];
120         int gpuid[64], count = 0, can_access_peer;
121 
122         for (int i = 0; i < deviceCount; i++)// record in gpuid the indices of the P2P-eligible devices
123         {
124             cudaGetDeviceProperties(&prop[i], i);
125 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)// on Windows the Tesla Compute Cluster (TCC) driver is required
126             if ((prop[i].major >= 2) && prop[i].tccDriver)
127 #else
128             if ((prop[i].major >= 2))
129 #endif
130                 gpuid[count++] = i;
131         }
132         if (count >= 2)
133         {
134             for (int i = 0; i < count - 1; i++)
135             {
136                 for (int j = i + 1; j < count; j++)
137                 {
138                     cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
139                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
140                         prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], can_access_peer ? "Yes" : "No");
141                 }
142             }
143         }
144     }
145 
146     // Overall summary of the device environment
147     printf("\n");
148     std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
149     char cTemp[16];
150 
151     sProfileString += ", NumDevs = ";// number of devices
152 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
153     sprintf_s(cTemp, 10, "%d", deviceCount);
154 #else
155     sprintf(cTemp, "%d", deviceCount);
156 #endif
157     sProfileString += cTemp;
158 
159     sProfileString += ", CUDA Driver Version = ";// driver version
160 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
161     sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
162 #else
163     sprintf(cTemp, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
164 #endif
165     sProfileString += cTemp;
166 
167     sProfileString += ", CUDA Runtime Version = ";// runtime version
168 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
169     sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
170 #else
171     sprintf(cTemp, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
172 #endif
173     sProfileString += cTemp;
174     printf("\n%s\n", sProfileString.c_str());
175 
176     printf("\nFinish: Result = Pass\n");
177     getchar();
178     return 0;
179 }

▶ 输出结果:

Start.

    CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"
  CUDA Driver Version / Runtime Version          9.0 / 9.0
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 8192 MBytes (8589934592 bytes)
  Multiprocessors: 16, CUDA Cores/MP: 128        2048 CUDA Cores
  GPU Max Clock rate:                            1645 MHz (1.64 GHz)
  Memory Clock rate:                             4004 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 2097152 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z):  (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z):  (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Supports Cooperative Kernel Launch:            No
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >


deviceQuery, CUDA Driver = CUDART, NumDevs = 1, CUDA Driver Version = 9.0, CUDA Runtime Version = 9.0

Finish: Result = Pass

▶ 源代码:Driver API

  1 #include <stdio.h>
  2 #include <cuda.h>
  3 #include <helper_cuda_drvapi.h>
  4 
  5 int main(int argc, char **argv)// Device query through the CUDA Driver API (cuDeviceGetAttribute et al.)
  6 {
  7     printf("Start.\n");
  8     printf("CUDA Device Query (Driver API) version (CUDART static linking)\n");
  9 
 10     CUresult error_id;
 11     if ((error_id = cuInit(0)) != CUDA_SUCCESS)
 12     {
 13         printf("\ncuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id));
 14         printf("\nResult = Fail\n");
 15         exit(EXIT_FAILURE);
 16     }
 17     int deviceCount = 0;
 18     if ((error_id = cuDeviceGetCount(&deviceCount)) != CUDA_SUCCESS)
 19     {
 20         printf("\ncuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
 21         printf("\nResult = FAIL\n");
 22         exit(EXIT_FAILURE);
 23     }
 24     printf("\nDetected %d CUDA Capable device(s)\n", deviceCount);
 25     for (CUdevice dev = 0; dev < deviceCount; ++dev)
 26     {
 27         char deviceName[256];
 28         if ((error_id = cuDeviceGetName(deviceName, 256, dev)) != CUDA_SUCCESS)
 29         {
 30             printf("\ncuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
 31             printf("\nResult = FAIL\n");
 32             exit(EXIT_FAILURE);
 33         }
 34         printf("\nDevice %d: \"%s\"\n", dev, deviceName);
 35         int driverVersion;
 36         cuDriverGetVersion(&driverVersion);
 37         printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
 38         int major, minor;
 39         if ((error_id = cuDeviceComputeCapability(&major, &minor, dev)) != CUDA_SUCCESS)
 40         {
 41             printf("\ncuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
 42             printf("\nResult = FAIL\n");
 43             exit(EXIT_FAILURE);
 44         }
 45         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);
 46         size_t totalGlobalMem;
 47         if ((error_id = cuDeviceTotalMem(&totalGlobalMem, dev)) != CUDA_SUCCESS)
 48         {
 49             printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
 50             printf("Result = FAIL\n");
 51             exit(EXIT_FAILURE);
 52         }
 53         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
 54             (float)totalGlobalMem / 1048576.0f, (unsigned long long) totalGlobalMem);
 55         int multiProcessorCount;
 56         getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 57         printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n", 
 58             multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor), _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
 59         int clockRate;
 60         getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 61         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f);
 62         int memoryClock;
 63         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
 64         printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
 65         int memBusWidth;
 66         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
 67         printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
 68         int L2CacheSize;
 69         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
 70         if (L2CacheSize)
 71             printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
 72         int maxTex1D, maxTex2D[2], maxTex3D[3];
 73         getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
 74         getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
 75         getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
 76         getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
 77         getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
 78         getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
 79         printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)\n", 
 80             maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);
 81         int  maxTex1DLayered[2];
 82         getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
 83         getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
 84         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n", maxTex1DLayered[0], maxTex1DLayered[1]);
 85         int  maxTex2DLayered[3];
 86         getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
 87         getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
 88         getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
 89         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
 90             maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
 91         int totalConstantMemory;
 92         getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
 93         printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
 94         int sharedMemPerBlock;
 95         getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
 96         printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
 97         int regsPerBlock;
 98         getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 99         printf("  Total number of registers available per block: %d\n", regsPerBlock);
100         int warpSize;
101         getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
102         printf("  Warp size:                                     %d\n", warpSize);
103         int maxThreadsPerMultiProcessor;
104         getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
105         printf("  Maximum number of threads per multiprocessor:  %d\n", maxThreadsPerMultiProcessor);
106         int maxThreadsPerBlock;
107         getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
108         printf("  Maximum number of threads per block:           %d\n", maxThreadsPerBlock);
109         int blockDim[3];
110         getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
111         getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
112         getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
113         printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
114         int gridDim[3];
115         getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
116         getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
117         getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
118         printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);
119         int textureAlign;
120         getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
121         printf("  Texture alignment:                             %u bytes\n", textureAlign);
122         int memPitch;
123         getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
124         printf("  Maximum memory pitch:                          %u bytes\n", memPitch);
125         int gpuOverlap;
126         getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
127         int asyncEngineCount;
128         getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
129         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
130         int kernelExecTimeoutEnabled;
131         getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
132         printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
133         int integrated;
134         getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
135         printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");
136         int canMapHostMemory;
137         getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
138         printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
139         int concurrentKernels;
140         getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
141         printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");
142         int surfaceAlignment;
143         getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
144         printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");
145         int eccEnabled;
146         getCudaAttribute<int>(&eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
147         printf("  Device has ECC support:                        %s\n", eccEnabled ? "Enabled" : "Disabled");
148 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
149         int tccDriver ;
150         getCudaAttribute<int>(&tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
151         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", tccDriver ?
152             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
153 #endif
154         int unifiedAddressing;
155         getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
156         printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");
157         int cooperativeLaunch;
158         getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
159         printf("  Supports Cooperative Kernel Launch:            %s\n", cooperativeLaunch ? "Yes" : "No");
160         int cooperativeMultiDevLaunch;
161         getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
162         printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");
163         int pciDomainID, pciBusID, pciDeviceID;
164         getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
165         getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
166         getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
167         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);
168 
169         const char *sComputeMode[] =
170         {
171             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
172             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
173             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
174             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
175             "Unknown",
176             NULL
177         };
178         int computeMode;
179         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
180         printf("  Compute Mode: < %s >\n", sComputeMode[computeMode]);
181     }
182 
183     if (deviceCount >= 2)// Multi-device case: report pairwise peer (P2P) access among eligible devices
184     {
185         int gpuid[64], count = 0, major, minor, tccDriver, can_access_peer;
186         for (int i = 0; i < deviceCount; i++)
187         {
188             cuDeviceComputeCapability(&major, &minor, i);
189             getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);// NOTE(review): queried unconditionally but only consulted on Windows below
190 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
191             if ((major >= 2) && tccDriver)
192 #else
193             if ((major >= 2))
194 #endif
195                 gpuid[count++] = i;
196         }
197         if (count >= 2)
198         {
199             char deviceName0[256], deviceName1[256]; 
200             for (int i = 0; i < count - 1; i++)
201             {
202                 for (int j = i + 1; j < count; j++)
203                 {
204                     cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
205                     cuDeviceGetName(deviceName0, 256, gpuid[i]);
206                     cuDeviceGetName(deviceName1, 256, gpuid[j]);
207                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
208                         deviceName0, gpuid[i], deviceName1, gpuid[j], can_access_peer ? "Yes" : "No"); 
209                 }
210             }
211         }
212     }
213 
214     printf("\nFinish: Result = Pass\n");
215     getchar();
216     return 0;
217 }

▶ 输出结果:

Start.
CUDA Device Query (Driver API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1070"
  CUDA Driver Version:                           9.0
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 8192 MBytes (8589934592 bytes)
  (16) Multiprocessors, (128) CUDA Cores/MP:     2048 CUDA Cores
  GPU Max Clock rate:                            1645 MHz (1.64 GHz)
  Memory Clock rate:                             4004 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 2097152 bytes
  Max Texture Dimension Sizes                    1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size (x,y,z):    (2147483647, 65535, 65535)
  Texture alignment:                             512 bytes
  Maximum memory pitch:                          2147483647 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Supports Cooperative Kernel Launch:            No
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Finish: Result = Pass

▶ 源代码:topologyQuery

 1 #include <iostream>
 2 #include <cuda_runtime.h>
 3 #include <helper_cuda.h>
 4 #include <helper_functions.h>
 5 
 6 // Report the device<->device topology (P2P access, native atomic support,
 7 // performance rank for each GPU pair) and the host<->device topology
 8 // (host native atomic support for each GPU).
 9 int main()
10 {
11     int deviceCount = 0;
12     if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)// guard: deviceCount would be unusable on failure
13         return 1;
14     for (int device1 = 0; device1 < deviceCount - 1; device1++)// device <-> device topology
15     {
16         for (int device2 = device1 + 1; device2 < deviceCount; device2++)
17         {
18             int perfRank = 0;
19             int atomicSupported = 0;
20             int accessSupported = 0;
21             cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2);
22             cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
23             cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2);
24             if (accessSupported)
25             {
26                 std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
27                 std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
28                 std::cout << "  * Perf Rank: " << perfRank << std::endl;
29             }
30         }
31     }
32     for (int device = 0; device < deviceCount; device++)// device <-> host topology
33     {
34         int atomicSupported = 0;
35         cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device);
36         std::cout << "GPU" << device << " <-> CPU:" << std::endl;
37         std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
38     }
39     getchar();
40     return 0;
41 }

▶ 输出结果:

GPU0 <-> CPU:
  * Atomic Supported: no

 

▶ 涨姿势:

● Runtime API 比 Driver API 写起来更简单,且能直接检测的内容不少于 Driver API。

● 用到的 Runtime API 函数和 Driver API 函数。

 1 // cuda_runtime_api.h
 2 extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
 3 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
 4 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
 5 
 6 // cuda_device_runtime_api.h
 7 #define __NV_WEAK__ __declspec(nv_weak)
 8 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
 9 
10 // cuda.h
11 CUresult CUDAAPI cuDeviceGetCount(int *count);
12 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
13 CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
14 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);

 

posted on 2017-12-14 21:19  爨爨爨好  阅读(728)  评论(0编辑  收藏  举报