CUDA 教程（三）CUDA C 编程简介

前言上一章我们学会了 CUDA 编程的必备基础知识，本章会为大家简单讲解 CUDA C 编程中的简单的内存管理，线程操作，如何编写核函数，使用 Thrust 库，并行计算，性能分析工具，我们会结合代码详细为大家讲解 CUDA C 编程，更为复杂的操作期待后续章节。

本教程禁止转载。同时，本教程来自知识星球【CV技术指南】更多技术教程，可加入星球学习。

欢迎关注公众号CV技术指南，专注于计算机视觉的技术总结、最新技术跟踪、经典论文解读、CV招聘信息。

CV各大方向专栏与各个部署框架最全教程整理

计算机视觉入门1v3辅导班

一获取 GPU 信息

CUDA 提供了几种获取 GPU 信息的方法，这里介绍一下通过调用 cuda_runtime.h中的 API 得到 GPU 的一些属性。

在编写 CUDA C 程序时，要将文件命名为 *.cu，一般使用 nvcc 命令编译运行，为 CUDA程序文件，支持 C/C++ 语法。

#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
int main() {
    int dev = 0;
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, dev);
    std::cout << "GPU Device Name" << dev << ": " << devProp.name << std::endl;
    std::cout << "SM Count: " << devProp.multiProcessorCount << std::endl;
    std::cout << "Shared Memory Size per Thread Block: " << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
    std::cout << "Threads per Thread Block: " << devProp.maxThreadsPerBlock << std::endl;
    std::cout << "Threads per SM: " << devProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << "Warps per SM: " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    return 0;
}

编译命令

nvcc checkDeviceInfor.cu -o checkDeviceInfor

输出如下，SM 数量为 30，每个线程块的共享内存为 48KB，每个线程块有 1024 个线程，每个 SM 有 1536 个线程，每个 SM 有 48 个线程束

GPU Device Name0: NVIDIA GeForce RTX 3060 Laptop GPU
SM Count: 30
Shared Memory Size per Thread Block: 48 KB
Threads per Thread Block: 1024
Threads per SM: 1536
Warps per SM: 48

二初步内存管理

主机和设备各自拥有独立的内存，C 拥有标准库可以管理主机的内存，CUDA 提供的 API 管理设备的内存，下面是 C 和 CUDA 的部分内存管理函数

C	CUDA	功能
malloc	cudaMalloc	分配内存
memcpy	cudaMemcpy	复制内存
memset	cudaMemset	设置内存
free	cudaFree	释放内存

主机与设备的数据拷贝

下面的程序举例了如何使用进行主机与设备的数据拷贝，使用了 cudaMalloc，cudaMemcpy 和 cudaFree 函数，函数形参如下

__host__ cudaError_t cudaMalloc (void** devPtr, size_t size)

devPtr: 开辟数据的首指针
size: 开辟的设备内存空间长度

__host__ cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)

dst: 目的数据内存首指针
src: 源数据首指针
count: 数据长度
kind: 拷贝类型，cudaMemcpyDeviceToHost: 从设备向主机拷贝 | cudaMemcpyDeviceToHost: 从主机向设备拷贝 | cudaMemcpyHostToHost: 从主机向主机拷贝 | cudaMemcpyDeviceToDevice: 从设备向设备拷贝

__host__ cudaError_t cudaFree (void* devPtr)
devPtr: 设备变量指针

上述函数的返回值类型都是 cudaError_t，以枚举形式保存各种错误类型

更多运行时函数详解见官方文档

#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <iostream>
#include <math.h>

int main() {
    float dets[6][4] = {
        {23, 34, 56, 76},
        {11, 23, 45, 45},
        {12, 22, 47, 47},
        {9, 45, 56, 65},
        {20, 37, 55, 75},
    };
    // copy data to gpu
    std::cout << sizeof(dets) << std::endl;

    float *dev_dets;
    cudaError_t err = cudaSuccess;
    err = cudaMalloc((void **)&dev_dets, sizeof(dets));
    if (err != cudaSuccess) {
        printf("cudaMalloc failed!");
        return 1;
    }
    cudaMemcpy(dev_dets, dets, sizeof(dets), cudaMemcpyHostToDevice);
    std::cout << "Copied data to GPU.\n";

    // get back copied cuda data
    float host_dets[sizeof(dets)/sizeof(float)];
    cudaMemcpy(&host_dets, dev_dets, sizeof(dets), cudaMemcpyDeviceToHost);
    std::cout << "Copied from cuda back to host.\n";
    std::cout << "host_dets size: " << sizeof(host_dets) << std::endl;
    for (int i=0;i<sizeof(dets)/sizeof(float);i++) {
        std::cout << host_dets[i] << " ";
    }
    std::cout << std::endl;
    cudaFree(dev_dets);

    std::cout << "done.\n";
 return 0;
}

// 输出为
96
Copied data to GPU.
Copied from cuda back to host.
host_dets size: 96
23 34 56 76 11 23 45 45 12 22 47 47 9 45 56 65 20 37 55 75 0 0 0 0 
done.

上面的程序使用cudaMalloc来申请设备内存，但二维数组不推荐这么做，在 kernel 运算时较高的性能损失，CUDA 给出了二维数组专用的内存申请函数cudaMallocPitch，在设备间内存拷贝时，也要使用cudaMemcpy2D函数，形参如下

__host__cudaError_t cudaMallocPitch ( void** devPtr, size_t* pitch, size_t width, size_t height )

devPtr: 开辟矩阵的数据的首指针
pitch: 分配存储器的宽度
width: 二维数组列数
height: 二维数组行数

__host__ cudaError_t cudaMemcpy2D ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind )

dst: 目的矩阵内存首指针
dpitch: dst指向的 2D 数组中的内存宽度，以字节为单位，是cuda为了读取方便，对齐过的内存宽度，可能大于一行元素占据的实际内存
src: 源矩阵内存首指针
spitch: src 指向的 2D 数组中的内存宽度
width: src指向的2D数组中一行元素占据的实际宽度，为 width*sizeof(type)
height: src指向的2D数组的行数
kind: 拷贝类型，cudaMemcpyDeviceToHost: 从设备向主机拷贝 | cudaMemcpyDeviceToHost: 从主机向设备拷贝 | cudaMemcpyHostToHost: 从主机向主机拷贝 | cudaMemcpyDeviceToDevice: 从设备向设备拷贝

#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include <iostream>
#include <math.h>

int main() {
    float dets[6][4] = {
        {23, 34, 56, 76},
        {11, 23, 45, 45},
        {12, 22, 47, 47},
        {9, 45, 56, 65},
        {20, 37, 55, 75},
    };
    size_t width = 4;
    size_t height = 6;
    size_t pitch;
    
    std::cout << sizeof(dets) << std::endl;
    float *dev_dets;
    cudaError_t err = cudaSuccess;
    err = cudaMallocPitch((void **)&dev_dets, &pitch, sizeof(float)*width, height);
    if (err != cudaSuccess) {
        printf("cudaMalloc failed!");
        return 1;
    }
    // copy data to gpu
    cudaMemcpy2D(dev_dets, pitch, dets, sizeof(float)*width, sizeof(float)*width, height,cudaMemcpyHostToDevice);
    std::cout << "Copied data to GPU.\n";

    // get back copied cuda data
    float host_dets[sizeof(dets)/sizeof(float)];
    cudaMemcpy2D(&host_dets, sizeof(float)*width, dev_dets, pitch, sizeof(float)*width, height,cudaMemcpyDeviceToHost);
    std::cout << "Copied from cuda back to host.\n";
    std::cout << "host_dets size: " << sizeof(host_dets) << std::endl;
    for (int i=0;i<width*height;i++) {
        std::cout << host_dets[i] << " ";
    }
    std::cout << std::endl;
    cudaFree(dev_dets);

    std::cout << "done.\n";
        return 0;
}

//输出为
96
Copied data to GPU.
Copied from cuda back to host.
host_dets size: 96
23 34 56 76 11 23 45 45 12 22 47 47 9 45 56 65 20 37 55 75 0 0 0 0 
done.

这两个函数应该会使 kernel 的运行时间变短，因为 pitch 对齐后可实现 global 内存联合访问，但cudaMallocPitch和cudaMemcpy2D会变慢，因为比一维的操作多了对齐的考虑

三 Kernel 函数

kernel 限定词

__device__: 在设备上执行，只能在设备上调用；
__global__: 在设备上执行，只能在主机上调用；
__host__: 在主机上执行，只能在主机上调用。

__device__和__global__代表函数在设备上执行，不支持递归，不能在函数体内声明静态变量，静态变量对应于CPU的整个程序生命过程，不能有可变长参数；

__global__和__host__不能一起使用，而__device__和__host__可以一起使用，编译器会在 CPU 和 GPU 各复制一份函数。

不添加限定词时，函数默认为__host__，也就是在主机上执行。

所有的 kernel 函数返回类型都是 void，且 kernel 函数都是异步执行。

kernel 调用方式

__global__ void kernel_func (param list) {
    // ...
}
kernel_func <<<Dg, Db, Ns, S>>> (param list);

<<<Dg, Db, Ns, S>>>: 是运算符内是核函数的执行参数，告诉编译器运行时如何启动核函数
Dg: grid 的维度和尺寸，dim3 类型，意为一个 grid 有多少个 block
Db: block 的维度和尺寸， dim3 类型，意为一个 block 有多少个 thread
Ns: （可选）用于设置每个block除了静态分配的 shared Memory 以外，最多能动态分配的 shared Memory 大小，单位为 byte 不需要动态分配时该值为0或省略不写
S: （可选） cudastream 类型的参数，表示该核函数处在哪个流之中

这里我们实现一下第二章最后的例子，下面的程序使用了cudaDeviceSynchronize和cudaDeviceReset函数，解释如下

__host__ __device__ cudaDeviceSynchronize: 使设备阻塞到完成所有前面请求的任务，CUDA 11.6 后已弃用
__host__ cudaDeviceReset: 显式销毁并清除当前进程中与设备关联的所有资源，资源不能再被访问，可能导致未定义的行为

由于 CUDA printf 的输出存储在缓冲中，后台同步机制会有延时，需要使用上面两个同步函数中任意一个使 printf 函数的内容与主机同步，即可输出

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void printThreadIndex() {
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy*blockDim.x * gridDim.x + ix;
    if(threadIdx.x == 3 && threadIdx.y == 1 && blockIdx.x == 0 && blockIdx.y == 1)
        printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d, %d), global index %2d \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx);
}

int main() {
    dim3 grid(2, 3, 1), block(4, 2, 1);
    cudaDeviceSynchronize();
    printThreadIndex<<<grid, block>>>();
    // cudaDeviceSynchronize(); 
    cudaDeviceReset();
    return 0;
}

// 输出为
thread_id (3,1) block_id (0,1) coordinate (3, 3), global index 27

在输出时不能使用 std::cout, std 命名空间不能使用到 GPU 上

四 CUDA 的 Thrust 库

CUDA 的 Thrust 库是基于标准模板库 STL 的 CUDA 的 C++ 模板库，通过与 CUDA C 配合使用，节省了大量优化算法的时间，保证了性能与开发效率，在 CUDA Toolkit 中包含 Thrust，无需额外安装，只需导入相应头文件，在调用时使用 thrust 命名空间，并尽量不要使用 using namespace std; 语句，因为 thrust 库和 STL 库非常多的重名

Vector 容器

Thrust 中定义了主机端和设备端的两种 vector，分别定义在 host_vector.h 和 device_vector.h 中，举例如下

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

int main() {
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);
    // initialize individual elements
    H[0] = 14;
    H[1] = 20;
    H[2] = 38;
    H[3] = 46;
    H.push_back(52);
    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;
    // print contents of H
    // for(int i = 0; i < H.size(); i++)
    for(auto i:H)
        std::cout << i << std::endl;
    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;
    // Copy host_vector H to device_vector D
    thrust::device_vector<int> D = H; 
    // elements of D can be modified
    D[0] = 99;
    D[1] = 88;
    // print contents of D
    for(auto i:D)
        std::cout << i << std::endl;
    // H and D are automatically deleted when the function returns
    return 0;
}

// 输出为
H has size 5
14
20
38
46
52
H now has size 2
99
88

Thrust 允许使用 = 运算符对 host_vector 和 device_vector 的相互拷贝，也允许使用 [i] 下标访问 device_vector 的各个元素，但是用这种方法访问每一次都需要调用 cudaMemcpy，性能损失较大，应谨慎使用。下面我们将介绍一些更有效的技术

下面展示 Thrust 提供的几种对 vector 操作的方法，包括初始化，赋值，iterator

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <iostream>

int main() {
    // initialize all ten integers of a device_vector to 1
    thrust::device_vector<int> D(10, 1);
    // set the first seven elements of a vector to 9
    thrust::fill(D.begin(), D.begin() + 7, 9);
    // initialize a host_vector with the first five elements of D
    thrust::host_vector<int> H(D.begin(), D.begin() + 5);
    // set the elements of H to 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());
    // copy all of H back to the beginning of D
    thrust::copy(H.begin(), H.end(), D.begin());
    // print D
    for(auto i:D)dd
        std::cout << i << " ";
    std::cout << std::endl;
    return 0;
}

//输出为
0 1 2 3 4 9 9 1 1 1

上面的程序使用了thrust::fill，当它对 device_vector iterator 操作时，会在编译时检查 iterator 在主机上还是在设备上，这个过程被称为静态调度，意味着调度过程没有运行时开销

指针

thrust 中定义了 device_ptr 数据类型，当传入函数的指针指向设备端内存时，需要用device_ptr进行封装

size_t N = 10;

// raw pointer to device memory
int * raw_ptr;
cudaMalloc((void **) &raw_ptr, N * sizeof(int));
// wrap raw pointer with a device_ptr 
thrust::device_ptr<int> dev_ptr(raw_ptr);
// use device_ptr in thrust algorithms
thrust::fill(dev_ptr, dev_ptr + N, (int) 0);

数值操作

Transformations

Transformations 是对一个输入范围中的每一个元素应用操作，将结果存储在给定范围中的方法，上面程序中已经 thrust::fill 就是一个 Transformations，它将范围内的所有元素设置为指定值。下面的程序用到了 thrust::sequence，thrust::replace，thrust::transform，更多 Transformations 请查看官方文档

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <iostream>

int main() {
    // allocate three device_vectors with 10 elements
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);
    // initialize X to 0,1,2,3, ....
    thrust::sequence(X.begin(), X.end());
    // compute Y = -X
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());
    // fill Z with twos
    thrust::fill(Z.begin(), Z.end(), 2);
    // compute Y = X mod 2
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());
    // replace all the ones in Y with tens
    thrust::replace(Y.begin(), Y.end(), 1, 10);
    // print Y
    thrust::copy(Y.begin(), Y.end(), std::ostream_iterator<int>(std::cout, " "));
    return 0;
}

// 输出为
0 10 0 10 0 10 0 10 0 10

SAXPY

SAXPY（Scalar Alpha X Plus Y）是一个在 BLAS（Basic Linear Algebra Subprograms）函数库提供中的函数，并且是一个并行向量处理机（vector processor）中常用的计算操作指令，为标量乘法和向量加法的组合，如

struct saxpy_functor {
    const float a;
    saxpy_functor(float _a) : a(_a) {}
    __host__ __device__
        float operator()(const float& x, const float& y) const { 
            return a * x + y;
        }
};

void saxpy(float A, thrust::device_vector<float>& X, thrust::device_vector<float>& Y)
{
    // y = a * x + y
    thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}

Reductions

使用 thrust::reduce 函数对一组数据进行操作，返回值为一个具体数值，下例就是对一组数据求和

int sum = thrust::reduce(D.begin(), D.end(), (int) 0, thrust::plus<int>());

上列中(int) 0为计算的初始值，thrust::plus<int>()为操作符，当没有定义初始值和操作符时，它们是默认值，因此下面的两条语句和上面的等价，更多操作符请查看官方文档

int sum = thrust::reduce(D.begin(), D.end(), (int) 0);
int sum = thrust::reduce(D.begin(), D.end());

thrust::transform_reduce允许接受多个操作符来对一组数据求值

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cmath>

// square<T> computes the square of a number f(x) -> x*x
template <typename T>
struct square {
    __host__ __device__
        T operator()(const T& x) const { 
            return x * x;
        }
};

int main() {
    // initialize host array
    float x[4] = {1.0, 2.0, 3.0, 4.0};
    // transfer to device
    thrust::device_vector<float> d_x(x, x + 4);
    // setup arguments
    square<float> unary_op;
    thrust::plus<float> binary_op;
    float init = 0;
    // compute norm
    float norm = std::sqrt( thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op));
    std::cout << norm << std::endl;
    return 0;
}

// 输出为
5.47723

上面的程序对一组数据计算平方和再开方，这种写法会大大优化性能。

Sorting

对数据进行排序，很常用的排序功能，举例如下

#include <thrust/sort.h>
#include <thrust/functional.h>
...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::sort(A, A + N);
// A is now {1, 2, 4, 5, 7, 8}

...
const int N = 6;
int    keys[N] = {  1,   4,   2,   8,   5,   7};
char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
thrust::sort_by_key(keys, keys + N, values);
// keys is now   {  1,   2,   4,   5,   7,   8}
// values is now {'a', 'c', 'b', 'e', 'f', 'd'}

...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::stable_sort(A, A + N, thrust::greater<int>());
// A is now {8, 7, 5, 4, 2, 1}

上例中的 thrust::stable_sort接受用户自定义比较运算符

max_element(min_element)

求最大（小）值

#include <thrust/extrema.h>
...
thrust::device_vector<type>::iterator iter = thrust::max_element(dvec.begin()，dvec.end());
int position = iter - dvec.begin();
type max_val = *iter;

其返回值是一个迭代器，需要获取最大（小）值所在位置，再得到结果

unique

将一组数据中满足条件的数据筛选出来，可自定义筛选条件

#include <thrust/unique.h>
...
struct is_same {
 __host__ __device__
  bool operator()(const float3 &p1, const float3 &p2)
 {
  return (p1.x==p2.x) && (p1.y==p2.y) && (p1.z==p2.z);
 }
};

thrust::unique(p.begin(), p.end(),is_same()),p.end();
p.erase(thrust::unique(p.begin(), p.end(),is_sam()),p.end());

unique 函数的功能只是将满足条件的数据筛选出来，无法直接删除，需要结合 vector 的 erase 函数进行删除

五建立 CUDA 的并行线程计算

下面的程序为大家演示以结构体类型存储的矩阵计算，后续章节会教大家使用 cuBLAS 库进行并行计算

矩阵加法

下面的程序进行了

#include<cuda.h>
#include<cuda_runtime.h>
#include<iostream>
#include<stdio.h>
struct Matrix {
    int w;
    int h;
    float *v;
};

__device__ float getValue(Matrix *A, int row, int col) {
        return A->v[row * A->w + col];
}

__device__ void setValue(Matrix *A, int row, int col, float v) {
        A->v[row * A->w + col] = v;
}

__global__ void MatrixAdd(Matrix *A, Matrix *B, Matrix *C) {
        int row = threadIdx.y + blockIdx.y * blockDim.y;
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        setValue(C, row, col, getValue(A, row, col) + getValue(B, row, col));
}

int main() {
    int w = 1 << 20;
    int h = 1 << 20;
    Matrix *A, *B, *C;
    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));
    int nBytes = w * h * sizeof(float);
    cudaMallocManaged((void**)&A->v, nBytes);
    cudaMallocManaged((void**)&B->v, nBytes);
    cudaMallocManaged((void**)&C->v, nBytes);

    A->h = h;
    A->w = w;
    B->h = h;
    B->w = w;
    C->h = h;
    C->w = w;
    for (int i = 0; i < w * h; ++i) {
        A->v[i] = 1.0;
        B->v[i] = 2.0;
    }

    dim3 blockSize(32, 32);
    dim3 gridSize((w + blockSize.x - 1) / blockSize.x, (h + blockSize.y - 1) / blockSize.y);
    MatrixAdd << < gridSize, blockSize >> >(A, B, C);
    cudaDeviceSynchronize();
    return 0;
}

矩阵乘法

下面的程序进行了

...
__global__ void MatrixMul(Matrix *A, Matrix *B, Matrix *C) {
        float k = 0.0;
        int row = threadIdx.y + blockIdx.y * blockDim.y;
        int col = threadIdx.x + blockIdx.x * blockDim.x;
        for (int i=0; i<A->w; i++)
                k += getValue(A, row, i) * getValue(B, i, col);
        setValue(C, row, col, k);
}

int main() {
...
    MatrixMul << < gridSize, blockSize >> >(A, B, C);
...
}

六为运行程序计时

nvprof

nvprof 是过去比较常用的命令行工具，但在终端直接输入nvprof ./*.o会得到以下 Warning

======== Warning: nvprof is not supported on devices with compute capability 8.0 and higher.
                  Use NVIDIA Nsight Systems for GPU tracing and CPU sampling and NVIDIA Nsight Compute for GPU profiling.
                  Refer https://developer.nvidia.com/tools-overview for more details.

目前主流的 CUDA 驱动不再支持nvprof命令，但我们仍可以在 NVIDIA Nsight Systems 中使用，在终端输入 nsys nvprof ./*.o就可以看到CUDA 程序执行的具体内容

这里我们以主机与设备的数据拷贝的两个程序为例

使用cudaMalloc函数的程序

WARNING: 1d.o and any of its children processes will be profiled.

96
Copied data to GPU.
Copied from cuda back to host.
host_dets size: 96
23 34 56 76 11 23 45 45 12 22 47 47 9 45 56 65 20 37 55 75 0 0 0 0 
done.
Generating '/tmp/nsys-report-01f4.qdstrm'
[1/7] [========================100%] report5.nsys-rep
[2/7] [========================100%] report5.sqlite
[3/7] Executing 'nvtxsum' stats report
SKIPPED: /root/report5.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cudaapisum' stats report

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)     Name   
 --------  ---------------  ---------  -----------  -----------  ---------  ---------  -----------  ----------
     99.9        137542088          1  137542088.0  137542088.0  137542088  137542088          0.0  cudaMalloc
      0.1           163239          1     163239.0     163239.0     163239     163239          0.0  cudaFree  
      0.0            36460          2      18230.0      18230.0      18070      18390        226.3  cudaMemcpy

[5/7] Executing 'gpukernsum' stats report
SKIPPED: /root/report5.sqlite does not contain CUDA kernel data.
[6/7] Executing 'gpumemtimesum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation     
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     51.6             1504      1    1504.0    1504.0      1504      1504          0.0  [CUDA memcpy HtoD]
     48.4             1408      1    1408.0    1408.0      1408      1408          0.0  [CUDA memcpy DtoH]

[7/7] Executing 'gpumemsizesum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation     
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
      0.000      1     0.000     0.000     0.000     0.000        0.000  [CUDA memcpy DtoH]
      0.000      1     0.000     0.000     0.000     0.000        0.000  [CUDA memcpy HtoD]

Generated:
    /root/report5.nsys-rep
    /root/report5.sqlite

使用cudaMallocPitch函数的程序

WARNING: 2d.o and any of its children processes will be profiled.

96
Copied data to GPU.
Copied from cuda back to host.
host_dets size: 96
23 34 56 76 11 23 45 45 12 22 47 47 9 45 56 65 20 37 55 75 0 0 0 0 
done.
Generating '/tmp/nsys-report-6614.qdstrm'
[1/7] [========================100%] report6.nsys-rep
[2/7] [========================100%] report6.sqlite
[3/7] Executing 'nvtxsum' stats report
SKIPPED: /root/report6.sqlite does not contain NV Tools Extension (NVTX) data.
[4/7] Executing 'cudaapisum' stats report

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)       Name      
 --------  ---------------  ---------  -----------  -----------  ---------  ---------  -----------  ---------------
    100.0        745692893          1  745692893.0  745692893.0  745692893  745692893          0.0  cudaMallocPitch
      0.0           161820          1     161820.0     161820.0     161820     161820          0.0  cudaFree       
      0.0            39090          2      19545.0      19545.0      16590      22500       4179.0  cudaMemcpy2D   

[5/7] Executing 'gpukernsum' stats report
SKIPPED: /root/report6.sqlite does not contain CUDA kernel data.
[6/7] Executing 'gpumemtimesum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)      Operation     
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ------------------
     64.8             2880      1    2880.0    2880.0      2880      2880          0.0  [CUDA memcpy HtoD]
     35.2             1567      1    1567.0    1567.0      1567      1567          0.0  [CUDA memcpy DtoH]

[7/7] Executing 'gpumemsizesum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)      Operation     
 ----------  -----  --------  --------  --------  --------  -----------  ------------------
      0.000      1     0.000     0.000     0.000     0.000        0.000  [CUDA memcpy DtoH]
      0.000      1     0.000     0.000     0.000     0.000        0.000  [CUDA memcpy HtoD]

Generated:
    /root/report6.nsys-rep
    /root/report6.sqlite

这里我们可以看到Total Time中cudaMallocPitch函数用时几乎是cudaMalloc的 5 倍，更加肯定了我们的说法，cudaMallocPitch和cudaMemcpy2D需要额外对二维数据进行对齐操作

cudaEvent 计时函数

以前面的矩阵乘法为例，分别计算 CUDA 开辟内存时间和矩阵乘法运算时间

int main() {
    int w = 1 << 20;
    int h = 1 << 20;
    Matrix *A, *B, *C;
    
    cudaEvent_t start, stop;
    float elapsedTime = 0.0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaMallocManaged((void**)&A, sizeof(Matrix));
    cudaMallocManaged((void**)&B, sizeof(Matrix));
    cudaMallocManaged((void**)&C, sizeof(Matrix));
    int nBytes = w * h * sizeof(float);
    cudaMallocManaged((void**)&A->v, nBytes);
    cudaMallocManaged((void**)&B->v, nBytes);
    cudaMallocManaged((void**)&C->v, nBytes);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsedTime, start, stop);
    std::cout << "cudaMalloc cost: " << elapsedTime << "s" << std::endl;
    
    A->h = h;
    A->w = w;
    B->h = h;
    B->w = w;
    C->h = h;
    C->w = w;
    for (int i = 0; i < w * h; ++i) {
        A->v[i] = 1.0;
        B->v[i] = 2.0;
    }

    dim3 blockSize(32, 32);
    dim3 gridSize((w + blockSize.x - 1) / blockSize.x, (h + blockSize.y - 1) / blockSize.y);

    elapsedTime = 0.0;
    cudaEventRecord(start, 0);

    MatrixMul << < gridSize, blockSize >> >(A, B, C);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    std::cout << "Matrix multiplication cost: " << elapsedTime << "s" << std::endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaDeviceSynchronize();
    return 0;
}

// 输出为
cudaMalloc cost: 0.161696s
Matrix multiplication cost: 0s