CUDA学习入门-1
关于俺的显卡
NVidia NVS 4200M
support opencl, directx11, DirectCompute, OpenGL 2.1
Memory Amount 1GB
CUDA compute capability 2.1
步骤
1. download cuda5 from https://developer.nvidia.com/thrust
2. install
中间有一步安装toolkit失败。。(为啥呀为啥?)
3. 编译例子程序成功,但是运行失败
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\CUDA Samples\v5.0\bin\win32\Debug>vectorAdd.exe
[Vector addition of 50000 elements]
Failed to allocate device vector A (error code CUDA driver version is insufficient for CUDA runtime version)! (为啥呀为啥?)
以下是例子程序
/**
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <math.h>    // fabs
#include <stdio.h>   // printf, fprintf
#include <stdlib.h>  // malloc, free, rand, exit

// For the CUDA runtime routines (prefixed with "cuda")
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * global index is >= numElements do nothing, so the grid may be larger than
 * the data. No shared memory is used.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Aborts the program if a CUDA runtime call failed.
 *
 * Prints "Failed to <action> (error code <message>)!" to stderr and exits
 * with EXIT_FAILURE when err is not cudaSuccess; otherwise does nothing.
 *
 * @param err     status returned by a CUDA runtime call
 * @param action  description of the attempted operation, inserted after
 *                "Failed to " in the diagnostic
 */
static void
checkCuda(cudaError_t err, const char *action)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Fills two host vectors with random values, adds them on the device with
 * vectorAdd, copies the result back and compares it against a host-side sum.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation, CUDA or
 * verification error.
 */
int
main(void)
{
    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vectors A and B and the host output vector C
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device input vectors A and B and the device output vector C.
    // The first cudaMalloc is also the first CUDA runtime call in the program.
    float *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "allocate device vector A");

    float *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_B, size), "allocate device vector B");

    float *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_C, size), "allocate device vector C");

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
              "copy vector A from host to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
              "copy vector B from host to device");

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    // Ceiling division so that the tail of the vector is covered
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // A kernel launch returns no status itself; query it explicitly
    checkCuda(cudaGetLastError(), "launch vectorAdd kernel");

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
              "copy vector C from device to host");

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Free device global memory
    checkCuda(cudaFree(d_A), "free device vector A");
    checkCuda(cudaFree(d_B), "free device vector B");
    checkCuda(cudaFree(d_C), "free device vector C");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit (this diagnostic uses its own message format)
    cudaError_t err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}
4. 安装后,有个DeviceQuery.exe,运行失败
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\C\bin\win32\Release>deviceQuery.exe [deviceQuery.exe] starting... deviceQuery.exe Starting... CUDA Device Query (Runtime API) version (CUDART static linking) cudaGetDeviceCount returned 35 -> CUDA driver version is insufficient for CUDA runtime version [deviceQuery.exe] test results... FAILED
5. 俺看到还有个oclDeviceQuery.exe,这应该是opencl版本的测试程序。点了一下这个能运行,以下是输出信息。
[oclDeviceQuery.exe] starting... F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA G PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclDeviceQuery.exe Starting... OpenCL SW Info: CL_PLATFORM_NAME: NVIDIA CUDA CL_PLATFORM_VERSION: OpenCL 1.0 CUDA 3.2.1 OpenCL SDK Revision: 7027912 OpenCL Device Info: 1 devices found supporting OpenCL: --------------------------------- Device NVS 4200M --------------------------------- CL_DEVICE_NAME: NVS 4200M CL_DEVICE_VENDOR: NVIDIA Corporation CL_DRIVER_VERSION: 268.24 CL_DEVICE_VERSION: OpenCL 1.0 CUDA CL_DEVICE_TYPE: CL_DEVICE_TYPE_GPU CL_DEVICE_MAX_COMPUTE_UNITS: 1 CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3 CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 / 1024 / 64 CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024 CL_DEVICE_MAX_CLOCK_FREQUENCY: 1620 MHz CL_DEVICE_ADDRESS_BITS: 32 CL_DEVICE_MAX_MEM_ALLOC_SIZE: 255 MByte CL_DEVICE_GLOBAL_MEM_SIZE: 1023 MByte CL_DEVICE_ERROR_CORRECTION_SUPPORT: no CL_DEVICE_LOCAL_MEM_TYPE: local CL_DEVICE_LOCAL_MEM_SIZE: 48 KByte CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: 64 KByte CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE CL_DEVICE_QUEUE_PROPERTIES: CL_QUEUE_PROFILING_ENABLE CL_DEVICE_IMAGE_SUPPORT: 1 CL_DEVICE_MAX_READ_IMAGE_ARGS: 128 CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8 CL_DEVICE_SINGLE_FP_CONFIG: denorms INF-quietNaNs round-to-nearest r ound-to-zero round-to-inf fma CL_DEVICE_IMAGE <dim> 2D_MAX_WIDTH 4096 2D_MAX_HEIGHT 32768 3D_MAX_WIDTH 2048 3D_MAX_HEIGHT 2048 3D_MAX_DEPTH 2048 CL_DEVICE_EXTENSIONS: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_d3d9_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 CL_DEVICE_COMPUTE_CAPABILITY_NV: 2.1 NUMBER OF MULTIPROCESSORS: 1 NUMBER OF CUDA CORES: 48 CL_DEVICE_REGISTERS_PER_BLOCK_NV: 32768 CL_DEVICE_WARP_SIZE_NV: 32 CL_DEVICE_GPU_OVERLAP_NV: 
CL_TRUE CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV: CL_TRUE CL_DEVICE_INTEGRATED_MEMORY_NV: CL_FALSE CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t> CHAR 1, SHORT 1, INT 1, LONG 1, FLOAT 1, DOUBLE 1 --------------------------------- 2D Image Formats Supported (71) --------------------------------- # Channel Order Channel Type 1 CL_R CL_FLOAT 2 CL_R CL_HALF_FLOAT 3 CL_R CL_UNORM_INT8 4 CL_R CL_UNORM_INT16 5 CL_R CL_SNORM_INT16 6 CL_R CL_SIGNED_INT8 7 CL_R CL_SIGNED_INT16 8 CL_R CL_SIGNED_INT32 9 CL_R CL_UNSIGNED_INT8 10 CL_R CL_UNSIGNED_INT16 11 CL_R CL_UNSIGNED_INT32 12 CL_A CL_FLOAT 13 CL_A CL_HALF_FLOAT 14 CL_A CL_UNORM_INT8 15 CL_A CL_UNORM_INT16 16 CL_A CL_SNORM_INT16 17 CL_A CL_SIGNED_INT8 18 CL_A CL_SIGNED_INT16 19 CL_A CL_SIGNED_INT32 20 CL_A CL_UNSIGNED_INT8 21 CL_A CL_UNSIGNED_INT16 22 CL_A CL_UNSIGNED_INT32 23 CL_RG CL_FLOAT 24 CL_RG CL_HALF_FLOAT 25 CL_RG CL_UNORM_INT8 26 CL_RG CL_UNORM_INT16 27 CL_RG CL_SNORM_INT16 28 CL_RG CL_SIGNED_INT8 29 CL_RG CL_SIGNED_INT16 30 CL_RG CL_SIGNED_INT32 31 CL_RG CL_UNSIGNED_INT8 32 CL_RG CL_UNSIGNED_INT16 33 CL_RG CL_UNSIGNED_INT32 34 CL_RA CL_FLOAT 35 CL_RA CL_HALF_FLOAT 36 CL_RA CL_UNORM_INT8 37 CL_RA CL_UNORM_INT16 38 CL_RA CL_SNORM_INT16 39 CL_RA CL_SIGNED_INT8 40 CL_RA CL_SIGNED_INT16 41 CL_RA CL_SIGNED_INT32 42 CL_RA CL_UNSIGNED_INT8 43 CL_RA CL_UNSIGNED_INT16 44 CL_RA CL_UNSIGNED_INT32 45 CL_RGBA CL_FLOAT 46 CL_RGBA CL_HALF_FLOAT 47 CL_RGBA CL_UNORM_INT8 48 CL_RGBA CL_UNORM_INT16 49 CL_RGBA CL_SNORM_INT16 50 CL_RGBA CL_SIGNED_INT8 51 CL_RGBA CL_SIGNED_INT16 52 CL_RGBA CL_SIGNED_INT32 53 CL_RGBA CL_UNSIGNED_INT8 54 CL_RGBA CL_UNSIGNED_INT16 55 CL_RGBA CL_UNSIGNED_INT32 56 CL_BGRA CL_UNORM_INT8 57 CL_BGRA CL_SIGNED_INT8 58 CL_BGRA CL_UNSIGNED_INT8 59 CL_ARGB CL_UNORM_INT8 60 CL_ARGB CL_SIGNED_INT8 61 CL_ARGB CL_UNSIGNED_INT8 62 CL_INTENSITY CL_FLOAT 63 CL_INTENSITY CL_HALF_FLOAT 64 CL_INTENSITY CL_UNORM_INT8 65 CL_INTENSITY CL_UNORM_INT16 66 CL_INTENSITY CL_SNORM_INT16 67 CL_LUMINANCE CL_FLOAT 68 CL_LUMINANCE CL_HALF_FLOAT 69 
CL_LUMINANCE CL_UNORM_INT8 70 CL_LUMINANCE CL_UNORM_INT16 71 CL_LUMINANCE CL_SNORM_INT16 --------------------------------- 3D Image Formats Supported (71) --------------------------------- # Channel Order Channel Type 1 CL_R CL_FLOAT 2 CL_R CL_HALF_FLOAT 3 CL_R CL_UNORM_INT8 4 CL_R CL_UNORM_INT16 5 CL_R CL_SNORM_INT16 6 CL_R CL_SIGNED_INT8 7 CL_R CL_SIGNED_INT16 8 CL_R CL_SIGNED_INT32 9 CL_R CL_UNSIGNED_INT8 10 CL_R CL_UNSIGNED_INT16 11 CL_R CL_UNSIGNED_INT32 12 CL_A CL_FLOAT 13 CL_A CL_HALF_FLOAT 14 CL_A CL_UNORM_INT8 15 CL_A CL_UNORM_INT16 16 CL_A CL_SNORM_INT16 17 CL_A CL_SIGNED_INT8 18 CL_A CL_SIGNED_INT16 19 CL_A CL_SIGNED_INT32 20 CL_A CL_UNSIGNED_INT8 21 CL_A CL_UNSIGNED_INT16 22 CL_A CL_UNSIGNED_INT32 23 CL_RG CL_FLOAT 24 CL_RG CL_HALF_FLOAT 25 CL_RG CL_UNORM_INT8 26 CL_RG CL_UNORM_INT16 27 CL_RG CL_SNORM_INT16 28 CL_RG CL_SIGNED_INT8 29 CL_RG CL_SIGNED_INT16 30 CL_RG CL_SIGNED_INT32 31 CL_RG CL_UNSIGNED_INT8 32 CL_RG CL_UNSIGNED_INT16 33 CL_RG CL_UNSIGNED_INT32 34 CL_RA CL_FLOAT 35 CL_RA CL_HALF_FLOAT 36 CL_RA CL_UNORM_INT8 37 CL_RA CL_UNORM_INT16 38 CL_RA CL_SNORM_INT16 39 CL_RA CL_SIGNED_INT8 40 CL_RA CL_SIGNED_INT16 41 CL_RA CL_SIGNED_INT32 42 CL_RA CL_UNSIGNED_INT8 43 CL_RA CL_UNSIGNED_INT16 44 CL_RA CL_UNSIGNED_INT32 45 CL_RGBA CL_FLOAT 46 CL_RGBA CL_HALF_FLOAT 47 CL_RGBA CL_UNORM_INT8 48 CL_RGBA CL_UNORM_INT16 49 CL_RGBA CL_SNORM_INT16 50 CL_RGBA CL_SIGNED_INT8 51 CL_RGBA CL_SIGNED_INT16 52 CL_RGBA CL_SIGNED_INT32 53 CL_RGBA CL_UNSIGNED_INT8 54 CL_RGBA CL_UNSIGNED_INT16 55 CL_RGBA CL_UNSIGNED_INT32 56 CL_BGRA CL_UNORM_INT8 57 CL_BGRA CL_SIGNED_INT8 58 CL_BGRA CL_UNSIGNED_INT8 59 CL_ARGB CL_UNORM_INT8 60 CL_ARGB CL_SIGNED_INT8 61 CL_ARGB CL_UNSIGNED_INT8 62 CL_INTENSITY CL_FLOAT 63 CL_INTENSITY CL_HALF_FLOAT 64 CL_INTENSITY CL_UNORM_INT8 65 CL_INTENSITY CL_UNORM_INT16 66 CL_INTENSITY CL_SNORM_INT16 67 CL_LUMINANCE CL_FLOAT 68 CL_LUMINANCE CL_HALF_FLOAT 69 CL_LUMINANCE CL_UNORM_INT8 70 CL_LUMINANCE CL_UNORM_INT16 71 CL_LUMINANCE CL_SNORM_INT16 
oclDeviceQuery, Platform Name = NVIDIA CUDA, Platform Version = OpenCL 1.0 CUDA 3.2.1, SDK Revision = 7027912, NumDevs = 1, Device = NVS 4200M System Info: Local Time/Date = 13:19:53, 5/25/2013 CPU Arch: 0 CPU Level: 6 # of CPU processors: 4 Windows Build: 2600 Windows Ver: 5.1 [oclDeviceQuery.exe] test results... PASSED Press ENTER to exit...
6. opencl的带宽测试程序
PU Computing SDK 4.0\OpenCL\Bin\Win32\release\oclBandwidthTest.exe Starting... Running on... NVS 4200M Quick Mode Host to Device Bandwidth, 1 Device(s), Paged memory, direct access Transfer Size (Bytes) Bandwidth(MB/s) 33554432 4131.2 Device to Host Bandwidth, 1 Device(s), Paged memory, direct access Transfer Size (Bytes) Bandwidth(MB/s) 33554432 3485.6 Device to Device Bandwidth, 1 Device(s) Transfer Size (Bytes) Bandwidth(MB/s) 33554432 8901.4 [oclBandwidthTest.exe] test results... PASSED
7. 关于安装driver失败,在stackoverflow上有个人说
http://stackoverflow.com/questions/11913320/installing-cuda-nvidia-graphic-driver-failed I have a VAIO too and I had the same problem. Don't download notebook version, try Desktop version of Nvidia Driver. I also had to disable my another Graphic card (Intel). It worked for me.
不过也有人说要修改inf文件才行
Unfortunately, there are many NVIDIA GPUs for which the driver from the NVIDIA website will not install (especially for GPU versions that are specifically OEM'd for Sony, Lenovo, etc and the OEM wants to control the driver experience). This is most likely the case for you. In those cases, you can edit the .inf file to add your GPU into the list of GPUs for which the driver will install. However, it is a bit tricky and typically requires editing 3 different sections of the INF file. You can search around for details on how to mod NVIDIA inf files; there are a number of sites that do that. Of course, you have to have the appropriate CUDA driver before you can run CUDA stuff. So first things first... you've gotta get the driver installed.
这些俺暂时没有测试过是否有效
8. 既然cuda用不了,而opencl貌似可以
那俺还是转移到opencl上吧,首先测试一个例子 http://www.kimicat.com/opencl-1/opencl-jiao-xue-yi
// OpenCL tutorial 1 #include <iostream> #include <string> #include <vector> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif int main() { cl_int err; cl_uint num; err = clGetPlatformIDs(0, 0, &num); if(err != CL_SUCCESS) { std::cerr << "Unable to get platforms\n"; return 0; } std::vector<cl_platform_id> platforms(num); err = clGetPlatformIDs(num, &platforms[0], &num); if(err != CL_SUCCESS) { std::cerr << "Unable to get platform ID\n"; return 0; } cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[0]), 0 }; cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_DEFAULT, NULL, NULL, NULL); if(context == 0) { std::cerr << "Can't create OpenCL context\n"; return 0; } size_t cb; clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); std::vector<cl_device_id> devices(cb / sizeof(cl_device_id)); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, &devices[0], 0); clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &cb); std::string devname; devname.resize(cb); clGetDeviceInfo(devices[0], CL_DEVICE_NAME, cb, &devname[0], 0); std::cout << "Device: " << devname.c_str() << "\n"; clReleaseContext(context); return 0; }
注意设置好路径
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\inc
F:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK 4.0\OpenCL\common\lib\Win32
正常编译运行成功!
Device: NVS 4200M
请按任意键继续. . .
9. 注: 后来俺在bios里面把integrated GPU disable后,成功安装307.83-quadro-notebook-winxp-32bit-international-whql.exe
安装cuda_5.0.35_winxp_general_32-3.msi还是有错误
但是执行cuda程序貌似都正常了
以下是bandwidthTest.exe测试结果,比opencl版本的快了很多
[CUDA Bandwidth Test] - Starting... Running on... Device 0: NVS 4200M Quick Mode Host to Device Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(MB/s) 33554432 6241.5 Device to Host Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(MB/s) 33554432 6302.9 Device to Device Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(MB/s) 33554432 10330.3
devicequery结果
ples\v5.0\bin\win32\Release\deviceQuery.exe Starting... CUDA Device Query (Runtime API) version (CUDART static linking) Detected 1 CUDA Capable device(s) Device 0: "NVS 4200M" CUDA Driver Version / Runtime Version 5.0 / 5.0 CUDA Capability Major/Minor version number: 2.1 Total amount of global memory: 1024 MBytes (1073283072 bytes) ( 1) Multiprocessors x ( 48) CUDA Cores/MP: 48 CUDA Cores GPU Clock rate: 1620 MHz (1.62 GHz) Memory Clock rate: 800 Mhz Memory Bus Width: 64-bit L2 Cache Size: 65536 bytes Max Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536,65535), 3 D=(2048,2048,2048) Max Layered Texture Size (dim) x layers 1D=(16384) x 2048, 2D=(16384,16 384) x 2048 Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 32768 Warp size: 32 Maximum number of threads per multiprocessor: 1536 Maximum number of threads per block: 1024 Maximum sizes of each dimension of a block: 1024 x 1024 x 64 Maximum sizes of each dimension of a grid: 65535 x 65535 x 65535 Maximum memory pitch: 2147483647 bytes Texture alignment: 512 bytes Concurrent copy and kernel execution: Yes with 1 copy engine(s) Run time limit on kernels: Yes Integrated GPU sharing Host Memory: No Support host page-locked memory mapping: Yes Alignment requirement for Surfaces: Yes Device has ECC support: Disabled CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Mo del) Device supports Unified Addressing (UVA): No Device PCI Bus ID / PCI location ID: 1 / 0 Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simu ltaneously) > deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.0, CUDA Runtime Versi on = 5.0, NumDevs = 1, Device0 = NVS 4200M