环境:Windows11
CUDA Toolkit安装
该工具提供了一个用于开发高性能、GPU 加速应用程序的开发环境。
- 在cmd窗口中输入
nvidia-smi
,查看显卡支持的最高CUDA版本以及驱动版本
Wed Dec 25 00:26:58 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.41 Driver Version: 531.41 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4060 L... WDDM | 00000000:01:00.0 On | N/A |
| N/A 48C P5 9W / N/A| 1769MiB / 8188MiB | 5% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
- 在 NVIDIA 官网的 CUDA Toolkit 下载页面(https://developer.nvidia.com/cuda-toolkit-archive)下载安装包,我下载的是 CUDA Toolkit 12.0(注意:安装的版本不能高于 nvidia-smi 显示的最高支持版本)
- 下载完后,运行NVIDIA安装程序
- 安装选项选择自定义
- 在path中添加值:
xxx\cuda_development\bin
xxx\cuda_development\libnvvp
(两条路径分别作为两个条目添加,xxx 替换为实际安装目录)
- 在CMD窗口下执行命令:
nvcc -V
进行CUDA版本的查看
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_19:04:39_Pacific_Standard_Time_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0
- 至此,CUDA Toolkit 安装与验证完成
基于CMAKE搭建第一个CUDA程序
- CMakeLists.txt如下:
# Minimum CMake version.
# NOTE: CMAKE_CUDA_ARCHITECTURES requires CMake >= 3.18 (and
# find_package(CUDAToolkit) requires >= 3.17); the old value of 3.8
# would make this file fail to configure as written.
cmake_minimum_required(VERSION 3.18)
# Project name, enabling the CUDA and C++ languages.
project(TEST CUDA CXX)
# Target GPU architecture: SM 8.9 (Ada Lovelace, e.g. RTX 40-series).
set(CMAKE_CUDA_ARCHITECTURES 89)
# C++ standard for host code.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# C++ standard for device (CUDA) code.
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Executable built from the CUDA source file.
add_executable(TEST src/test.cu)
# Extra CUDA header search path (replace with your actual install path;
# the CUDA language support already adds the toolkit's own includes).
target_include_directories(TEST PRIVATE
    "D:/cuda/cuda_development/include"
)
# Enable separate compilation (relocatable device code).
set_target_properties(TEST PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
)
# Link CUDA libraries via the toolkit's imported targets instead of bare
# library names — the imported targets carry the correct link directories
# on Windows, where plain "cuda"/"cudart" may not be found.
find_package(CUDAToolkit REQUIRED)
target_link_libraries(TEST PRIVATE
    CUDA::cuda_driver
    CUDA::cudart
)
- test.cu如下:
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
// CUDA kernel function for vector addition
// Element-wise vector addition on the device: c[i] = a[i] + b[i]
// for every i in [0, size). Expects a 1-D launch; the guard clause
// retires the surplus tail threads of the last block.
__global__ void addKernel(int* c, const int* a, const int* b, int size) {
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gid >= size) {
        return;  // no element assigned to this thread
    }
    c[gid] = a[gid] + b[gid];
}
// Abort with a readable message when a CUDA runtime call fails.
// Device allocations are reclaimed by the driver at process exit,
// so exiting here does not leak GPU memory.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        std::exit(1);
    }
}

// Host driver: allocates device buffers, runs addKernel on two
// 5-element vectors, copies the sums back, prints them, and releases
// all device resources. Returns 0 on success; exits(1) on any CUDA error.
int main() {
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    const size_t bytes = arraySize * sizeof(int);

    // Allocate GPU memory. Every CUDA call is checked: unchecked errors
    // are sticky and only surface at a later, unrelated call.
    int* d_a = nullptr;
    int* d_b = nullptr;
    int* d_c = nullptr;
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc d_c");

    // Copy input data from host to device.
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy a");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy b");

    // Launch the kernel with a ceil-div grid so every element gets a thread.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (arraySize + threadsPerBlock - 1) / threadsPerBlock;
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(d_c, d_a, d_b, arraySize);
    // cudaGetLastError() reports launch-configuration errors only; errors
    // raised *during* kernel execution appear at the next synchronizing
    // call, so synchronize explicitly before trusting the results.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Copy the result back to the host.
    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy c");

    // Print results.
    std::cout << "Vector addition results:" << std::endl;
    for (int i = 0; i < arraySize; ++i) {
        std::cout << a[i] << " + " << b[i] << " = " << c[i] << std::endl;
    }

    // Free GPU memory and reset the device before exiting.
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
- 运行结果如下: