本文假设 Qt、CUDA 和 Visual Studio 均已安装完成并能够正常运行。
1.创建一个空的Qt项目
2.创建一个.cu文件,本文创建的为kernel.cu
内容如下
1 #include "cuda_runtime.h"
2 #include "device_launch_parameters.h"
3 #include <stdio.h>
4 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector addition: thread t writes c[t] = a[t] + b[t].
// NOTE: there is no bounds check — the launch configuration must create
// no more threads than there are elements (here it is launched as
// <<<1, size>>>, i.e. exactly one thread per element in a single block).
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
// Host-side entry point (C linkage so it can be called from the Qt/C++
// side, which is compiled by MSVC rather than nvcc).
// Adds two fixed 5-element vectors on the GPU via addWithCuda() and
// prints the result to stdout; errors are reported on stderr.
extern "C"
void run()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        // Include the runtime's error text so failures are diagnosable.
        fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(cudaStatus));
        return;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(cudaStatus));
        return;
    }
}
// Helper function for using CUDA to add vectors in parallel.
// Copies a and b (size ints each) to the device, launches addKernel with
// one thread per element in a single block (so size must not exceed the
// device's max threads per block), and copies the result back into c.
// Returns cudaSuccess or the first error encountered. Device buffers are
// always released via the Error: label before returning, on both the
// success and failure paths.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_c) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_a) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (dev_b) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (a -> dev_a) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (b -> dev_b) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel (launch-configuration
    // errors surface here, not from the launch statement itself).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (dev_c -> c) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // cudaFree(NULL) is a no-op, so this is safe even when an early
    // failure left some pointers unallocated.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
3.编写pro文件
CONFIG += console

TARGET = test

# Define output directories
DESTDIR = ../bin
# FIX: the variable reference needs $$ — a bare OBJECTS_DIR would be
# emitted literally into the path instead of being expanded by qmake.
CUDA_OBJECTS_DIR = $$OBJECTS_DIR/../cuda

# This makes the .cu files appear in your project
CUDA_SOURCES += \
    kernel.cu

# MSVCRT link option (static or dynamic, it must be the same with your Qt SDK link option)
MSVCRT_LINK_FLAG_DEBUG   = "/MDd"
MSVCRT_LINK_FLAG_RELEASE = "/MD"

# CUDA settings
CUDA_DIR    = $$(CUDA_PATH)     # Path to cuda toolkit install (from the CUDA_PATH environment variable)
SYSTEM_NAME = x64               # Depending on your system either 'Win32', 'x64', or 'Win64'
SYSTEM_TYPE = 64                # '32' or '64', depending on your system
CUDA_ARCH   = sm_50             # Type of CUDA architecture
NVCC_OPTIONS = --use_fast_math

# include paths
INCLUDEPATH += $$CUDA_DIR/include \
               $$CUDA_DIR/common/inc \
               $$CUDA_DIR/../shared/inc

# library directories
QMAKE_LIBDIR += $$CUDA_DIR/lib/$$SYSTEM_NAME \
                $$CUDA_DIR/common/lib/$$SYSTEM_NAME \
                $$CUDA_DIR/../shared/lib/$$SYSTEM_NAME

# The following makes sure all path names (which often include spaces) are put between quotation marks
CUDA_INC = $$join(INCLUDEPATH,'" -I"','-I"','"')

# Add the necessary libraries
# NOTE: no trailing backslash on the last entry — a line continuation into
# a comment line is fragile in qmake.
CUDA_LIB_NAMES = cudart_static kernel32 user32 gdi32 winspool comdlg32 \
                 advapi32 shell32 ole32 oleaut32 uuid odbc32 odbccp32
                 #freeglut glew32

for(lib, CUDA_LIB_NAMES) {
    CUDA_LIBS += -l$$lib
}
LIBS += $$CUDA_LIBS

# Configuration of the Cuda compiler
CONFIG(debug, debug|release) {
    # Debug mode
    cuda_d.input    = CUDA_SOURCES
    cuda_d.output   = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.obj
    cuda_d.commands = $$CUDA_DIR/bin/nvcc.exe -D_DEBUG $$NVCC_OPTIONS $$CUDA_INC $$LIBS \
                      --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH \
                      --compile -cudart static -g -DWIN32 -D_MBCS \
                      -Xcompiler "/wd4819,/EHsc,/W3,/nologo,/Od,/Zi,/RTC1" \
                      -Xcompiler $$MSVCRT_LINK_FLAG_DEBUG \
                      -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda_d.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda_d
}
else {
    # Release mode
    cuda.input    = CUDA_SOURCES
    cuda.output   = $$CUDA_OBJECTS_DIR/${QMAKE_FILE_BASE}_cuda.obj
    cuda.commands = $$CUDA_DIR/bin/nvcc.exe $$NVCC_OPTIONS $$CUDA_INC $$LIBS \
                    --machine $$SYSTEM_TYPE -arch=$$CUDA_ARCH \
                    --compile -cudart static -DWIN32 -D_MBCS \
                    -Xcompiler "/wd4819,/EHsc,/W3,/nologo,/O2,/Zi" \
                    -Xcompiler $$MSVCRT_LINK_FLAG_RELEASE \
                    -c -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
    cuda.dependency_type = TYPE_C
    QMAKE_EXTRA_COMPILERS += cuda
}

SOURCES += \
    main.cpp
需要注意,系统环境变量中需要存在 CUDA_PATH(指向 CUDA Toolkit 的安装目录,上面 pro 文件中的 $$(CUDA_PATH) 即读取该变量),如果没有需要自行添加
4.编译运行即可