ruijiege

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

1.安装cuda

2.安装插件Nsight Visual Studio Code Edition 和c++

3.给VSCode添加头文件的搜索路径 

(55条消息) vscode中配置或添加头文件路径_vscode 配置头文件路径_Markus.Zhao的博客-CSDN博客

 

4.没有提示

blockDim
需要添加头文件
#include <device_launch_parameters.h>

5.查看grid,block的详细信息

找到cuda对应的位置,执行可执行文件

root@xintent-nx:/usr/local/cuda-10.2/samples/bin/aarch64/linux/release# ./deviceQuery 

 6.如果需要debug cuda程序和debug c++程序相同

nvcc -g -G setrun.cu -o set

 

 7.终极计算position的位置公式

#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <device_launch_parameters.h>

// https://www.cnblogs.com/tiandsp/p/9458734.html  这个是反面教材
// compute最终会在gpu上,以3个线程启动进行执行
__global__ void compute(float* a, float* b, float* c){

    /* 
    * The function invokes kernel func on gridDim (gridDim.x × gridDim.y × gridDim.z) grid of blocks. 
    * Each block contains blockDim (blockDim.x × blockDim.y × blockDim.z) threads.
    * gridDim、blockDim、blockIdx、threadIdx是系统内置的变量,可以直接访问 
    * gridDim,对应网格维度,由kernel启动时指定,deviceQuery中明确了,gridDim的最大值是受限的 Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
    * blockDim,每个网格里面块的维度,由kernel启动时指定,deviceQuery中明确了,gridDim的最大值是受限的 Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
    * blockIdx,是核在运行时,所处block的索引
    * threadIdx,是核在运行时,所处thread的索引
    * grid、block是虚拟的,由CUDA的任务调度器管理并分配到真实cuda core中,实际每次启动的线程数由调度器决定
    * gridDim、blockDim都是dim3类型,具有x、y、z属性值,blockIdx、threadIdx类型是uint3,具有x、y、z属性值
    * 哪个线程先运行是不确定的
    */

    int d0 = gridDim.z;
    int d1 = gridDim.y;
    int d2 = gridDim.x;
    int d3 = blockDim.z;
    int d4 = blockDim.y;
    int d5 = blockDim.x;

    // 构成了一个tensor是d0 x d1 x d2 x d3 x d4 x d5
    int p0 = blockIdx.z;
    int p1 = blockIdx.y;
    int p2 = blockIdx.x;
    int p3 = threadIdx.z;
    int p4 = threadIdx.y;
    int p5 = threadIdx.x;

    int position = (((((p0 * d1) + p1) * d2 + p2) * d3 + p3) * d4 + p4) * d5 + p5;

    //int position = ((blockIdx.y * gridDim.x) + blockIdx.x + threadIdx.y) * blockDim.x + threadIdx.x;
    //int position = ((gridDim.x * blockIdx.y + blockIdx.x) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
    //int position = (blockDim.x * blockIdx.x + threadIdx.x);
    c[position] = a[position] * b[position];

    printf("gridDim = %dx%dx%d, blockDim = %dx%dx%d, [blockIdx = %d,%d,%d, threadIdx = %d,%d,%d], position = %d, avalue = %f\n", 
        gridDim.x, gridDim.y, gridDim.z,
        blockDim.x, blockDim.y, blockDim.z,
        blockIdx.x, blockIdx.y, blockIdx.z,
        threadIdx.x, threadIdx.y, threadIdx.z,
        position, a[position]
    );
}

int main(){
    const int num = 16;
    float a[num] = {1, 2, 3};
    float b[num] = {5, 7, 9};
    float c[num] = {0};
    for(int i = 0; i < num; ++i){
        a[i] = i;
        b[i] = i;
    }

    size_t size_array = sizeof(c);
    float* device_a = nullptr;
    float* device_b = nullptr;
    float* device_c = nullptr;
    
    // 分配GPU中的内存,大小是3个float,也就是12个字节
    cudaMalloc(&device_a, size_array);
    cudaMalloc(&device_b, size_array);
    cudaMalloc(&device_c, size_array);
    
    // 把cpu中的数组复制到GPU中
    cudaMemcpy(device_a, a, size_array, cudaMemcpyHostToDevice);
    cudaMemcpy(device_b, b, size_array, cudaMemcpyHostToDevice);
    
    // 启动核函数compute,并以1个block和3个thread的方式进行运行
    compute<<<1, 32>>>(device_a, device_b, device_c);
    
    // 等待核函数执行完毕后,将数据复制到CPU(host)上变量c中
    cudaMemcpy(c, device_c, size_array, cudaMemcpyDeviceToHost);
    
    // 打印c中的数据
    for(int i = 8; i < 8 + 3; ++i){
        printf("value.%d = %f\n", i, c[i]);
    }
    return 0;
}
View Code

 

posted on 2023-07-10 18:10  哦哟这个怎么搞  阅读(1741)  评论(0编辑  收藏  举报