Performance Optimization with CUDA Constant Memory

Constant memory is declared by prefixing a variable with the __constant__ qualifier. It is used to hold data that does not change during kernel execution. NVIDIA hardware provides 64 KB of constant memory, and in typical cases reading from constant memory instead of global memory can significantly reduce memory bandwidth usage.

Constant memory is populated with the cudaMemcpyToSymbol() function.
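
A minimal sketch of the pattern (the names coeffs and h_coeffs are illustrative, not part of the programs below):

// declared at file scope; the array lives in the 64 KB constant memory bank
__constant__ float coeffs[16];

// on the host, before launching the kernel:
float h_coeffs[16] = { /* ... */ };
cudaMemcpyToSymbol(coeffs, h_coeffs, sizeof(float) * 16);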

In the sample code I create a struct that represents a 2D point (a "vector"), with a member function distance that returns the squared distance between a given point and the struct's coordinates. The goal of the program: given 20 such points, for every point of a 256x256 plane, compute the sum of the squared distances from that point to the 20 stored points.
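
In other words, for each pixel (x, y) the kernel computes

rslt(x, y) = sum for i = 0..19 of ( (x - x_i)^2 + (y - y_i)^2 )

where (x_i, y_i) are the coordinates of the i-th stored point.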

First, the program written with global memory:

#include <iostream>
#include <cstdlib>   // rand(), RAND_MAX

// generate a random float in [0, x]
inline float rnd(float x)
{
    return x * rand() / RAND_MAX;
}

// check cuda error
inline void check(cudaError_t call, const char* file, const int line)
{
    if (call != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorName(call) << std::endl;
        std::cout << "at file: " << file << ", line: " << line << std::endl;
        std::cout << cudaGetErrorString(call) << std::endl;
    }
}

#define CHECK(call) (check(call, __FILE__, __LINE__))

const int row = 256;
const int col = 256;

struct vec
{
    float x, y;
    // squared distance from (x1, y1) to this point
    __device__ float distance(float x1, float y1)
    {
        float dx = x1 - x;
        float dy = y1 - y;
        float dis = dx * dx + dy * dy;
        return dis;
    }
};

__global__ void do_something(vec* vec_set, float* rslt)
{
    int index_x = threadIdx.x + blockDim.x * blockIdx.x;
    int index_y = threadIdx.y + blockDim.y * blockIdx.y;
    // linear index into the row*col result array
    int index = index_x + index_y * blockDim.x * gridDim.x;

    // accumulate in a register and write once: d_rslt is never
    // zero-initialized on the device, so "rslt[index] += ..." would
    // read uninitialized memory
    float sum = 0.0f;
    for (int i = 0; i < 20; ++i)
    {
        sum += vec_set[i].distance(index_x, index_y);
    }
    rslt[index] = sum;
}

int main(void)
{
    // host memory
    float* h_rslt = new float[row * col];
    vec* h_vec_set = new vec[20];
    for (int i = 0; i < 20; ++i)
    {
        h_vec_set[i].x = rnd(5.0f);
        h_vec_set[i].y = rnd(5.0f);
    }
    for (int i = 0; i < row * col; ++i)
    {
        h_rslt[i] = 0.0;
    }

    // device memory
    float* d_rslt;
    vec* d_vec_set;

    // malloc and copy on gpu
    CHECK(cudaMalloc((void**)&d_rslt, sizeof(float) * col * row));
    CHECK(cudaMalloc((void**)&d_vec_set, sizeof(vec) * 20));
    CHECK(cudaMemcpy(d_vec_set, h_vec_set, sizeof(vec) * 20, cudaMemcpyHostToDevice));
    dim3 thread(16, 16);
    dim3 block(row/16, col/16);

    // cuda event
    cudaEvent_t start, end;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));
    CHECK(cudaEventRecord(start, 0));

    // running kernel function
    do_something<<<block, thread>>>(d_vec_set, d_rslt);
    CHECK(cudaGetLastError());   // catch kernel launch errors
    CHECK(cudaEventRecord(end, 0));
    CHECK(cudaEventSynchronize(end));

    // copy result to host
    CHECK(cudaMemcpy(h_rslt, d_rslt, sizeof(float) * row * col, cudaMemcpyDeviceToHost));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, end));
    std::cout << "elapsed time: " << elapsed_time << " ms" << std::endl;
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(end));

    CHECK(cudaFree(d_rslt));
    CHECK(cudaFree(d_vec_set));
    delete[] h_rslt;
    delete[] h_vec_set;
    return 0;
}

Run time:

Next, the program written with constant memory:

#include <iostream>
#include <cstdlib>   // rand(), RAND_MAX

// generate a random float in [0, x]
inline float rnd(float x)
{
    return x * rand() / RAND_MAX;
}

// check cuda error
inline void check(cudaError_t call, const char* file, const int line)
{
    if (call != cudaSuccess)
    {
        std::cout << "cuda error: " << cudaGetErrorName(call) << std::endl;
        std::cout << "at file: " << file << ", line: " << line << std::endl;
        std::cout << cudaGetErrorString(call) << std::endl;
    }
}

#define CHECK(call) (check(call, __FILE__, __LINE__))

const int row = 256;
const int col = 256;

struct vec
{
    float x, y;
    // squared distance from (x1, y1) to this point
    __device__ float distance(float x1, float y1)
    {
        float dx = x1 - x;
        float dy = y1 - y;
        float dis = dx * dx + dy * dy;
        return dis;
    }
};

// constant memory on cuda
__constant__ vec vec_list[20];

__global__ void do_something1(float* rslt)
{
    int index_x = threadIdx.x + blockDim.x * blockIdx.x;
    int index_y = threadIdx.y + blockDim.y * blockIdx.y;
    // linear index into the row*col result array
    int index = index_x + index_y * blockDim.x * gridDim.x;

    // accumulate in a register and write once (d_rslt is never zeroed on the device)
    float sum = 0.0f;
    for (int i = 0; i < 20; ++i)
    {
        sum += vec_list[i].distance(index_x, index_y);
    }
    rslt[index] = sum;
}

int main(void)
{
    // host memory
    float* h_rslt = new float[row * col];
    vec* h_vec_set = new vec[20];
    for (int i = 0; i < 20; ++i)
    {
        h_vec_set[i].x = rnd(5.0f);
        h_vec_set[i].y = rnd(5.0f);
    }
    for (int i = 0; i < row * col; ++i)
    {
        h_rslt[i] = 0.0;
    }

    // device memory
    float* d_rslt;
    CHECK(cudaMalloc((void**)&d_rslt, sizeof(float) * col * row));
    // copy value to constant memory
    CHECK(cudaMemcpyToSymbol(vec_list, h_vec_set, sizeof(vec) * 20));
    dim3 thread(16, 16);
    dim3 block(row/16, col/16);

    // use cuda event recording kernel function running time
    cudaEvent_t start, end;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));
    CHECK(cudaEventRecord(start, 0));

    // running kernel function
    do_something1<<<block, thread>>>(d_rslt);
    CHECK(cudaGetLastError());   // catch kernel launch errors

    // end record event
    CHECK(cudaEventRecord(end, 0));
    CHECK(cudaEventSynchronize(end));
    CHECK(cudaMemcpy(h_rslt, d_rslt, sizeof(float) * row * col, cudaMemcpyDeviceToHost));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, end));
    std::cout << "elapsed time: " << elapsed_time << " ms" << std::endl;
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(end));

    // memory free on host and device
    CHECK(cudaFree(d_rslt));
    delete[] h_rslt;
    delete[] h_vec_set;
    return 0;
}
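
Either version can be built and run with nvcc in the usual way (the file name here is illustrative):

nvcc -O2 constant_mem_demo.cu -o constant_mem_demo
./constant_mem_demo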

Run time:

Compared with global memory, reading data from constant memory saves memory bandwidth for two main reasons:

1. A single read from constant memory can be broadcast to nearby threads, saving up to 15 additional read operations.

2. Constant memory is cached, so consecutive reads of the same address incur no extra memory traffic (cache hits).

 

The notion of nearby threads is tied to the warp. A warp is a group of 32 threads that execute as a unit: every thread in the warp executes the same instruction in lockstep, each on its own data. When servicing constant memory, NVIDIA hardware broadcasts a single memory read to each half-warp, a group of 16 threads. When a half-warp executes the read, the hardware fetches the address only once and broadcasts the result to the other threads, so the bandwidth consumed is roughly 1/16 (about 6%) of the equivalent global memory reads. On top of that, if other half-warps request the same address, the request hits the constant cache, and the performance gain is even larger. The kernels above have exactly this ideal pattern: at loop iteration i every thread reads the same vec_list[i], so each half-warp issues one broadcast read and later half-warps hit the cache.

 

When does constant memory fail to improve performance? When the threads of a half-warp access different data in constant memory (their read addresses diverge), the reads are serialized, taking up to 16 times as long as a single broadcast read. In this program, if each thread did not read sequentially from the start of vec_list but instead read a different &vec_list[random], constant memory would provide no benefit, and the kernel would actually run slower than the global memory version.
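
A hypothetical illustration of that divergent pattern (not part of the programs above; lookup is an illustrative per-thread index array), reusing the vec struct and vec_list from the constant memory version:

// anti-pattern: threads of the same half-warp read different constant
// addresses, so the broadcast degenerates into serialized reads
__global__ void scattered_read(float* rslt, const int* lookup)
{
    int index = threadIdx.x + blockDim.x * blockIdx.x;
    // lookup[index] differs from thread to thread, so up to 16 distinct
    // constant addresses are requested per half-warp and served one by one
    vec v = vec_list[lookup[index] % 20];
    rslt[index] = v.distance(1.0f, 2.0f);
}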
