MindSpore:CUDA编程（五）Event

Event是CUDA中的事件，可以在流（stream）中标记某个执行点，常用于统计核函数、数据拷贝等操作的耗时以及进行同步；配合错误检查宏，还可以帮助分析、检测CUDA程序中的错误。

一般我们会定义一个宏：

#pragma once
#include <stdio.h>
 
// CHECK(call): evaluates the CUDA runtime API expression `call` exactly once.
// If the result is not cudaSuccess, prints the source file, line, numeric
// error code and the cudaGetErrorString() text, then terminates the process
// with exit(1).
// The do { ... } while (0) wrapper makes the macro expand to one statement,
// so it is safe inside un-braced if/else bodies. The argument is
// parenthesized and the local uses an unlikely name so that it cannot
// capture a caller variable that appears inside `call`.
#define CHECK(call)                                        \
do                                                         \
{                                                          \
    const cudaError_t check_err_ = (call);                 \
    if (check_err_ != cudaSuccess)                         \
    {                                                      \
        printf("CUDA Error:\n");                           \
        printf("    File:       %s\n", __FILE__);          \
        printf("    Line:       %d\n", __LINE__);          \
        printf("    Error code: %d\n", (int)check_err_);   \
        printf("    Error text: %s\n",                     \
            cudaGetErrorString(check_err_));               \
        exit(1);                                           \
    }                                                      \
} while (0)

并在适当的位置用这个宏包住CUDA运行时API的调用，以便在出错时打印CUDA的错误日志。注意：#pragma once 不要放在源代码文件（.cu）里，它一般只放在头文件（例如 error.cuh）中，用于防止头文件被重复包含。

Event的调用有以下内容：

具体的顺序如下：

（1）声明Event（这里以计算核函数运行时间前后的start Event和stop Event为例）

// Event handles that bracket the region being timed.
cudaEvent_t start;
cudaEvent_t stop;

（2）创建Event

// Create both events before they are recorded; CHECK aborts with a
// diagnostic if either creation fails.
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));

（3）添加Event（在合适的地方）

// Record `start` immediately before the work to be timed ...
CHECK(cudaEventRecord(start));
// ... the work being timed (for example a kernel launch) goes here ...
// ... and record `stop` immediately after it.
CHECK(cudaEventRecord(stop));

（4）等待Event完成

（a）非阻塞方式——可以用于一些不需要等待的处理

// Non-blocking status poll: the call returns immediately.
// Deliberately NOT wrapped in CHECK: per the CUDA runtime documentation,
// cudaErrorNotReady is a normal result while the event is still pending.
cudaEventQuery(start);

（b）阻塞方式——可以用于执行核函数后等待核函数执行完毕后的处理

// Block the host until `stop` has completed; a failure here is reported
// through CHECK instead of being silently ignored.
CHECK(cudaEventSynchronize(stop));

（5）计算两个Event间隔时间

// Receives the time between the two events, in milliseconds.
float elapsed_time = 0.0f;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));

（6）销毁Event

// Release both events once their timings have been read.
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));

以上次介绍的矩阵乘为例，完整的代码如下：

#pragma once
#include <stdio.h>
 
// CHECK(call): evaluates the CUDA runtime API expression `call` exactly once.
// If the result is not cudaSuccess, prints the source file, line, numeric
// error code and the cudaGetErrorString() text, then terminates the process
// with exit(1).
// The do { ... } while (0) wrapper makes the macro expand to one statement,
// so it is safe inside un-braced if/else bodies. The argument is
// parenthesized and the local uses an unlikely name so that it cannot
// capture a caller variable that appears inside `call`.
#define CHECK(call)                                        \
do                                                         \
{                                                          \
    const cudaError_t check_err_ = (call);                 \
    if (check_err_ != cudaSuccess)                         \
    {                                                      \
        printf("CUDA Error:\n");                           \
        printf("    File:       %s\n", __FILE__);          \
        printf("    Line:       %d\n", __LINE__);          \
        printf("    Error code: %d\n", (int)check_err_);   \
        printf("    Error text: %s\n",                     \
            cudaGetErrorString(check_err_));               \
        exit(1);                                           \
    }                                                      \
} while (0)
 
#include <stdio.h>
#include <math.h>
#include "error.cuh"
 
#define BLOCK_SIZE 32
 
// Integer matrix multiply kernel: c = a * b, where a is m x n, b is n x k and
// c is m x k, all indexed row-major.
// Launch layout: 2D grid of 2D blocks. The x dimension selects the output
// column and the y dimension the output row; each thread produces at most
// one element of c. Threads that fall outside the m x k output do nothing.
__global__ void gpu_matrix_mult(int *a,int *b, int *c, int m, int n, int k)
{
    const int out_row = blockIdx.y * blockDim.y + threadIdx.y;
    const int out_col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: the grid may be larger than the output matrix.
    if (out_row >= m || out_col >= k)
    {
        return;
    }

    // Dot product of row `out_row` of a with column `out_col` of b.
    int acc = 0;
    for (int t = 0; t < n; ++t)
    {
        acc += a[out_row * n + t] * b[t * k + out_col];
    }
    c[out_row * k + out_col] = acc;
}
 
// Host reference implementation of the integer matrix product.
// h_a is m x n, h_b is n x k and h_result is m x k, all row-major.
// Every element of h_result is overwritten.
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int row = 0; row < m; ++row)
    {
        const int *a_row = h_a + row * n;      // start of row `row` in A
        int *out_row = h_result + row * k;     // start of row `row` in the result
        for (int col = 0; col < k; ++col)
        {
            int acc = 0;
            for (int t = 0; t < n; ++t)
            {
                acc += a_row[t] * h_b[t * k + col];
            }
            out_row[col] = acc;
        }
    }
}
 
// Times an integer matrix multiplication on the GPU with CUDA events, times
// the device-to-host copy of the result, and verifies the GPU result against
// the CPU reference. Prints both timings and "Pass!!!" / "Error!!!".
int main(int argc, char const *argv[])
{
    // Command-line arguments are not used by this example.
    (void)argc;
    (void)argv;

    // A is m x n, B is n x k and the product C is m x k.
    const int m = 100;
    const int n = 100;
    const int k = 100;
    const size_t bytes_a = sizeof(int) * m * n;
    const size_t bytes_b = sizeof(int) * n * k;
    const size_t bytes_c = sizeof(int) * m * k;

    // Declare and create the events:
    //   start -> stop  brackets the kernel,
    //   stop  -> stop2 brackets the device-to-host copy.
    cudaEvent_t start, stop, stop2;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventCreate(&stop2));

    // Host buffers: inputs h_a / h_b, GPU result h_c, CPU reference h_cc.
    int *h_a, *h_b, *h_c, *h_cc;
    CHECK(cudaMallocHost((void **) &h_a, bytes_a));
    CHECK(cudaMallocHost((void **) &h_b, bytes_b));
    CHECK(cudaMallocHost((void **) &h_c, bytes_c));
    CHECK(cudaMallocHost((void **) &h_cc, bytes_c));

    // Fill both inputs with pseudo-random values in [0, 1023].
    for (int i = 0; i < m * n; ++i) {
        h_a[i] = rand() % 1024;
    }
    for (int i = 0; i < n * k; ++i) {
        h_b[i] = rand() % 1024;
    }

    int *d_a, *d_b, *d_c;
    CHECK(cudaMalloc((void **) &d_a, bytes_a));
    CHECK(cudaMalloc((void **) &d_b, bytes_b));
    CHECK(cudaMalloc((void **) &d_c, bytes_c));

    // Copy matrices A and B from host to device memory.
    CHECK(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid covers the whole m x k output.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Record the start event just before the kernel.
    CHECK(cudaEventRecord(start));
    // Non-blocking poll. Not wrapped in CHECK: cudaErrorNotReady is a normal
    // result while the event is still pending.
    (void)cudaEventQuery(start);

    // Launch the kernel defined in this file (the listing contains no
    // gpu_matrix_mult_shared definition).
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // Kernel launches return no status; fetch launch errors explicitly.
    CHECK(cudaGetLastError());

    // Record the stop event, then block until the kernel has finished.
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    // Kernel time: stop - start.
    float elapsed_time = 0.0f;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("start-》stop：Time = %g ms.\n", elapsed_time);

    // Copy the result back to the host.
    CHECK(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost));

    // Record stop2 after the copy. The event must have completed before
    // cudaEventElapsedTime may use it; otherwise that call fails with
    // cudaErrorNotReady and leaves the output unset, so wait for it here.
    CHECK(cudaEventRecord(stop2));
    CHECK(cudaEventSynchronize(stop2));

    // Copy time: stop2 - stop.
    float elapsed_time2 = 0.0f;
    CHECK(cudaEventElapsedTime(&elapsed_time2, stop, stop2));
    printf("stop-》stop2：Time = %g ms.\n", elapsed_time2);

    // Destroy the events.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaEventDestroy(stop2));

    // CPU reference result.
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    // The data is integral, so the two results must match exactly.
    int ok = 1;
    for (int i = 0; i < m * k; ++i)
    {
        if (h_cc[i] != h_c[i])
        {
            ok = 0;
        }
    }

    if(ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    // Free device and host memory (including the CPU reference buffer).
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFreeHost(h_b));
    CHECK(cudaFreeHost(h_c));
    CHECK(cudaFreeHost(h_cc));
    return 0;
}

在Quadro P1000的GPU上执行：

这里以矩阵乘为例，打印了调用矩阵乘核函数的时间，以及后面 cudaMemcpy的时间。

我们强行将 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k, cudaMemcpyHostToDevice)); 改为 CHECK(cudaMemcpy(d_b, h_b, sizeof(int)*n*k*2, cudaMemcpyHostToDevice)); 故意让拷贝的长度越界，以便观察 CHECK 宏打印出的错误信息。

再重新编译，运行，看看效果：