Performance Optimization with CUDA Constant Memory
Constant memory is declared by prefixing a variable with __constant__. It is meant for data that does not change during kernel execution. NVIDIA hardware provides 64 KB of constant memory, and in the common case, using constant memory instead of global memory can significantly reduce memory bandwidth usage.
Constant memory is populated with the cudaMemcpyToSymbol() function.
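A minimal sketch of the declare-and-copy pattern (the symbol name coeffs and its size are illustrative, not part of the program below):

// a small lookup table placed in the 64 KB constant memory space
__constant__ float coeffs[16];

void upload_coeffs(const float* h_coeffs) {
    // note: the first argument is the device symbol itself, not its address
    cudaMemcpyToSymbol(coeffs, h_coeffs, sizeof(float) * 16);
}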
In the sample code I define a struct that represents a vector; it has a member function distance that computes the squared distance between a given point and the vector's coordinates. The goal of the program: given 20 vectors, compute a value for every point in a 256×256 planar grid, where the value is the sum of squared distances from that point to the 20 vectors.
First, the version written with global memory:
#include <iostream>
#include <cstdlib>

// generate a random float in [0, x]
inline float rnd(float x) { return x * rand() / RAND_MAX; }

// check cuda error
inline void check(cudaError_t call, const char* file, const int line) {
    if (call != cudaSuccess) {
        std::cout << "cuda error: " << cudaGetErrorName(call) << std::endl;
        std::cout << "at file: " << file << ", line: " << line << std::endl;
        std::cout << cudaGetErrorString(call) << std::endl;
    }
}
#define CHECK(call) (check(call, __FILE__, __LINE__))

const int row = 256;
const int col = 256;

struct vec {
    float x, y;
    // squared distance from (x1, y1) to this vector's coordinates
    __device__ float distance(float x1, float y1) {
        float dx = x1 - x;
        float dy = y1 - y;
        return dx * dx + dy * dy;
    }
};

__global__ void do_something(vec* vec_set, float* rslt) {
    int index_x = threadIdx.x + blockDim.x * blockIdx.x;
    int index_y = threadIdx.y + blockDim.y * blockIdx.y;
    int index = index_x + index_y * blockDim.x * gridDim.x;
    for (int i = 0; i < 20; ++i) {
        rslt[index] += vec_set[i].distance(index_x, index_y);
    }
}

int main(void) {
    // host memory
    float* h_rslt = new float[row * col];
    vec* h_vec_set = new vec[20];
    for (int i = 0; i < 20; ++i) {
        h_vec_set[i].x = rnd(5.0f);
        h_vec_set[i].y = rnd(5.0f);
    }
    for (int i = 0; i < row * col; ++i) {
        h_rslt[i] = 0.0f;
    }

    // device memory
    float* d_rslt;
    vec* d_vec_set;

    // malloc and copy on gpu
    CHECK(cudaMalloc((void**)&d_rslt, sizeof(float) * col * row));
    CHECK(cudaMalloc((void**)&d_vec_set, sizeof(vec) * 20));
    CHECK(cudaMemcpy(d_vec_set, h_vec_set, sizeof(vec) * 20, cudaMemcpyHostToDevice));
    // cudaMalloc does not zero memory, so copy the zero-initialized
    // buffer before the kernel accumulates into it
    CHECK(cudaMemcpy(d_rslt, h_rslt, sizeof(float) * row * col, cudaMemcpyHostToDevice));

    dim3 thread(16, 16);
    dim3 block(row / 16, col / 16);

    // cuda events to time the kernel
    cudaEvent_t start, end;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));
    CHECK(cudaEventRecord(start, 0));

    // running kernel function
    do_something<<<block, thread>>>(d_vec_set, d_rslt);

    CHECK(cudaEventRecord(end, 0));
    CHECK(cudaEventSynchronize(end));

    // copy result to host
    CHECK(cudaMemcpy(h_rslt, d_rslt, sizeof(float) * row * col, cudaMemcpyDeviceToHost));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, end));
    std::cout << "elapsed time: " << elapsed_time << " ms" << std::endl;

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(end));
    CHECK(cudaFree(d_rslt));
    CHECK(cudaFree(d_vec_set));
    delete[] h_rslt;
    delete[] h_vec_set;
    return 0;
}
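Either version can be built and run with nvcc; the file name here is just an example:

nvcc -O2 global_mem.cu -o global_mem
./global_mem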
Run time:
Then the version written with constant memory:
#include <iostream>
#include <cstdlib>

// generate a random float in [0, x]
inline float rnd(float x) { return x * rand() / RAND_MAX; }

// check cuda error
inline void check(cudaError_t call, const char* file, const int line) {
    if (call != cudaSuccess) {
        std::cout << "cuda error: " << cudaGetErrorName(call) << std::endl;
        std::cout << "at file: " << file << ", line: " << line << std::endl;
        std::cout << cudaGetErrorString(call) << std::endl;
    }
}
#define CHECK(call) (check(call, __FILE__, __LINE__))

const int row = 256;
const int col = 256;

struct vec {
    float x, y;
    // squared distance from (x1, y1) to this vector's coordinates
    __device__ float distance(float x1, float y1) {
        float dx = x1 - x;
        float dy = y1 - y;
        return dx * dx + dy * dy;
    }
};

// constant memory on cuda
__constant__ vec vec_list[20];

__global__ void do_something1(float* rslt) {
    int index_x = threadIdx.x + blockDim.x * blockIdx.x;
    int index_y = threadIdx.y + blockDim.y * blockIdx.y;
    int index = index_x + index_y * blockDim.x * gridDim.x;
    for (int i = 0; i < 20; ++i) {
        rslt[index] += vec_list[i].distance(index_x, index_y);
    }
}

int main(void) {
    // host memory
    float* h_rslt = new float[row * col];
    vec* h_vec_set = new vec[20];
    for (int i = 0; i < 20; ++i) {
        h_vec_set[i].x = rnd(5.0f);
        h_vec_set[i].y = rnd(5.0f);
    }
    for (int i = 0; i < row * col; ++i) {
        h_rslt[i] = 0.0f;
    }

    // device memory
    float* d_rslt;
    CHECK(cudaMalloc((void**)&d_rslt, sizeof(float) * col * row));
    // cudaMalloc does not zero memory, so copy the zero-initialized buffer
    CHECK(cudaMemcpy(d_rslt, h_rslt, sizeof(float) * row * col, cudaMemcpyHostToDevice));

    // copy values to constant memory
    CHECK(cudaMemcpyToSymbol(vec_list, h_vec_set, sizeof(vec) * 20));

    dim3 thread(16, 16);
    dim3 block(row / 16, col / 16);

    // use cuda events to record the kernel running time
    cudaEvent_t start, end;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));
    CHECK(cudaEventRecord(start, 0));

    // running kernel function
    do_something1<<<block, thread>>>(d_rslt);

    // end record event
    CHECK(cudaEventRecord(end, 0));
    CHECK(cudaEventSynchronize(end));

    CHECK(cudaMemcpy(h_rslt, d_rslt, sizeof(float) * row * col, cudaMemcpyDeviceToHost));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, end));
    std::cout << "elapsed time: " << elapsed_time << " ms" << std::endl;

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(end));

    // memory free on host and device
    CHECK(cudaFree(d_rslt));
    delete[] h_rslt;
    delete[] h_vec_set;
    return 0;
}
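Note the structural difference: vec_list now lives at file scope with a size fixed at compile time, so the vector array needs no cudaMalloc/cudaMemcpy/cudaFree, and the kernel takes one fewer parameter. The trade-off is that the array size must be known at compile time and everything placed in constant memory must fit within the 64 KB limit.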
Run time:
Compared with global memory, reading data from constant memory saves memory bandwidth for two main reasons:
1. A single read from constant memory can be broadcast to nearby threads, saving up to 15 separate read operations.
2. Constant memory is cached, so consecutive reads of the same address generate no additional memory traffic (cache hits).
The notion of "nearby" threads is tied to the warp. A warp is a group of 32 threads that executes as a unit, in lockstep: every thread in the warp executes the same instruction, each on different data. When servicing constant-memory reads, NVIDIA hardware broadcasts a single read to each half-warp, a group of 16 threads. When the threads of a half-warp read the same address, only one read is actually issued and the result is broadcast to the other threads, so the bandwidth used is roughly 1/16 (about 6%) of what the equivalent global-memory reads would consume. Furthermore, if other half-warps request the same address, the read hits in the constant cache, and the performance gain is even larger.
When does constant memory fail to improve performance? When the threads of a half-warp access different data in constant memory (the read addresses diverge), the reads are serialized, taking up to 16 times as long. In this program, if each thread did not start reading from the beginning of vec_list, but instead different threads read different addresses &vec_list[ random ], then constant memory would bring no optimization benefit; the kernel would actually run slower, as sketched below.
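For illustration, a hypothetical kernel with that divergent access pattern (the name do_something_divergent and the offset scheme are made up for this sketch; it reuses vec and vec_list from the program above). Each thread still visits all 20 vectors, so the result is unchanged, but in any given iteration the threads of a half-warp request different addresses:

__global__ void do_something_divergent(float* rslt) {
    int index_x = threadIdx.x + blockDim.x * blockIdx.x;
    int index_y = threadIdx.y + blockDim.y * blockIdx.y;
    int index = index_x + index_y * blockDim.x * gridDim.x;
    for (int i = 0; i < 20; ++i) {
        // neighbouring threads now read different constant-memory
        // addresses in the same step, so the half-warp broadcast cannot
        // apply and the 16 reads are serialized
        int j = (index + i) % 20;
        rslt[index] += vec_list[j].distance(index_x, index_y);
    }
}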