Environment: Windows 11

CUDA Toolkit Installation

The CUDA Toolkit provides a development environment for building high-performance, GPU-accelerated applications.

  1. Run nvidia-smi in a cmd window to check the driver version and the highest CUDA version the driver supports:
Wed Dec 25 00:26:58 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.41                 Driver Version: 531.41       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4060 L...  WDDM | 00000000:01:00.0  On |                  N/A |
| N/A   48C    P5                9W /  N/A|   1769MiB /  8188MiB |      5%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
  2. Install the CUDA Toolkit from the download page; I downloaded CUDA Toolkit 12.0.
  3. When the download completes, run the NVIDIA installer:
    1. Choose Custom for the installation options.
    2. Add the following entries to PATH:
    xxx\cuda_development\libnvvp
    xxx\cuda_development\bin
    
  4. Run nvcc -V in a CMD window to verify the CUDA version (a small device-query program is sketched after the output below):
    nvcc -V
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2023 NVIDIA Corporation
    Built on Fri_Jan__6_19:04:39_Pacific_Standard_Time_2023
    Cuda compilation tools, release 12.0, V12.0.140
    Build cuda_12.0.r12.0/compiler.32267302_0
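
As a further sanity check beyond nvcc -V, here is a minimal sketch (the file name deviceQuery.cu is my own choice for illustration) that queries the driver and runtime versions plus the properties of GPU 0 through the CUDA runtime API, confirming that the driver and toolkit are installed consistently.

// deviceQuery.cu -- installation sanity check (illustrative sketch)
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int driverVersion = 0, runtimeVersion = 0;
    cudaDriverGetVersion(&driverVersion);    // highest CUDA version the installed driver supports
    cudaRuntimeGetVersion(&runtimeVersion);  // version of the CUDA runtime being used

    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
        std::printf("No CUDA-capable device found.\n");
        return 1;
    }

    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);       // properties of GPU 0

    // Versions are encoded as 1000 * major + 10 * minor, e.g. 12000 means 12.0
    std::printf("Driver %d.%d / Runtime %d.%d\n",
                driverVersion / 1000, (driverVersion % 1000) / 10,
                runtimeVersion / 1000, (runtimeVersion % 1000) / 10);
    std::printf("Device 0: %s, compute capability %d.%d\n",
                prop.name, prop.major, prop.minor);
    return 0;
}

Compile it with nvcc deviceQuery.cu -o deviceQuery; on an RTX 4060 the reported compute capability should be 8.9, which matches the value 89 used for CMAKE_CUDA_ARCHITECTURES in the next section.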
    

Building a First CUDA Program with CMake

  1. CMakeLists.txt is as follows (a more portable linking variant is sketched after the file):
# Minimum CMake version (3.18+ is needed for CMAKE_CUDA_ARCHITECTURES)
cmake_minimum_required(VERSION 3.18)

# Project name and languages
project(TEST CUDA CXX)

# Target CUDA architecture (RTX 4060 is compute capability 8.9)
set(CMAKE_CUDA_ARCHITECTURES 89)
# C++ standard
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# CUDA standard
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Add the executable
add_executable(TEST src/test.cu)

# Add the CUDA include directory
target_include_directories(TEST PRIVATE
    "D:/cuda/cuda_development/include"  # replace with your actual CUDA include path
)

# Enable CUDA separable compilation
set_target_properties(TEST PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
)

# Link additional CUDA libraries if needed
target_link_libraries(TEST PRIVATE
    cuda
    cudart
)
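
Hard-coding the include path and linking cuda / cudart by their bare names works, but it ties the file to one machine. As a sketch of a more portable alternative (assuming CMake 3.17 or newer, which ships the FindCUDAToolkit module), the same dependencies can be expressed with imported targets:

# Alternative sketch: let CMake locate the toolkit instead of hard-coding paths
find_package(CUDAToolkit REQUIRED)

target_link_libraries(TEST PRIVATE
    CUDA::cudart        # CUDA runtime library (replaces the bare "cudart")
    CUDA::cuda_driver   # CUDA driver library (replaces the bare "cuda")
)
# The imported targets also carry the toolkit include directory,
# so the explicit target_include_directories call is no longer needed.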


  2. test.cu is as follows (an error-checking helper is sketched after the file):
#include <iostream>
#include <cuda_runtime.h>

// CUDA kernel function for vector addition
__global__ void addKernel(int* c, const int* a, const int* b, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Allocate GPU memory
    int* d_a = nullptr;
    int* d_b = nullptr;
    int* d_c = nullptr;
    cudaMalloc((void**)&d_a, arraySize * sizeof(int));
    cudaMalloc((void**)&d_b, arraySize * sizeof(int));
    cudaMalloc((void**)&d_c, arraySize * sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice);

    // Launch CUDA kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (arraySize + threadsPerBlock - 1) / threadsPerBlock;
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(d_c, d_a, d_b, arraySize);

    // Check for kernel launch errors
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
        return 1;
    }

    // Copy result back to host
    cudaMemcpy(c, d_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);

    // Print results
    std::cout << "Vector addition results:" << std::endl;
    for (int i = 0; i < arraySize; ++i) {
        std::cout << a[i] << " + " << b[i] << " = " << c[i] << std::endl;
    }

    // Free GPU memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Reset device
    cudaDeviceReset();

    return 0;
}
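
The example above only checks the kernel launch. A pattern worth adopting early, shown here as an illustrative sketch (the CUDA_CHECK name is my own, not part of the CUDA API), is to wrap every runtime call in a macro that reports the failing call site and exits:

// Illustrative helper: wrap runtime calls such as cudaMalloc and cudaMemcpy.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            std::fprintf(stderr, "CUDA error %s at %s:%d: %s\n",      \
                         cudaGetErrorName(err_), __FILE__, __LINE__,  \
                         cudaGetErrorString(err_));                   \
            std::exit(EXIT_FAILURE);                                  \
        }                                                             \
    } while (0)

// Example usage inside main():
//   CUDA_CHECK(cudaMalloc((void**)&d_a, arraySize * sizeof(int)));
//   CUDA_CHECK(cudaMemcpy(d_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice));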

  3. Running the program should print the five sums, from 1 + 10 = 11 through 5 + 50 = 55.