对比设备线性二维数组和 CUDA 二维数组在纹理引用中的效率
▶ 源代码。分别绑定相同大小的设备线性二维数组和 CUDA 二维数组为纹理引用,做简单的平移操作,重复若干次计算带宽和访问速度。
1 #include <stdio.h> 2 #ifdef _WIN32 3 # define WINDOWS_LEAN_AND_MEAN 4 # define NOMINMAX 5 # include <windows.h> 6 #endif 7 #include <cuda_runtime.h> 8 #include "device_launch_parameters.h" 9 #include <helper_functions.h> 10 #include <helper_cuda.h> 11 12 #define NUM_REPS 100 // test 重复次数 13 #define TILE_DIM 16 // 线程块尺寸 14 15 texture<float, 2, cudaReadModeElementType> texRefPL; 16 texture<float, 2, cudaReadModeElementType> texRefArray; 17 18 __global__ void shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY) 19 { 20 int xid = blockIdx.x * blockDim.x + threadIdx.x; 21 int yid = blockIdx.y * blockDim.y + threadIdx.y; 22 23 odata[yid * pitch + xid] = tex2D(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height); 24 } 25 26 __global__ void shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY) 27 { 28 int xid = blockIdx.x * blockDim.x + threadIdx.x; 29 int yid = blockIdx.y * blockDim.y + threadIdx.y; 30 31 odata[yid * pitch + xid] = tex2D(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height); 32 } 33 34 bool test() 35 { 36 bool result = true; 37 int i, j, ishift, jshift; 38 // 数组大小以及 x,y 方向上的偏移量 39 const int nx = 2048; 40 const int ny = 2048; 41 const int x_shift = 5; 42 const int y_shift = 7; 43 if ((nx % TILE_DIM) || (ny % TILE_DIM)) 44 { 45 printf("nx and ny must be multiples of TILE_DIM\n"); 46 return EXIT_FAILURE; 47 } 48 dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM); 49 50 cudaEvent_t start, stop; 51 cudaEventCreate(&start); 52 cudaEventCreate(&stop); 53 54 //int devID = findCudaDevice(argc, (const char **)argv);// 使用device 0,不再使用命令行参数进行判断 55 56 // 申请内存 57 float *h_idata = (float *)malloc(sizeof(float) * nx * ny); 58 float *h_odata = (float *)malloc(sizeof(float) * nx * ny); 59 float *h_ref = (float *)malloc(sizeof(float) * nx * ny); 60 for (int i = 0; i < nx * ny; ++i) 61 h_idata[i] = (float)i; 62 float *d_idataPL; 63 size_t d_pitchBytes; 64 cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny); 65 cudaArray *d_idataArray; 66 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); 67 cudaMallocArray(&d_idataArray, &channelDesc, nx, ny); 68 float *d_odata; 69 cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny); 70 71 // 拷贝内存(两组) 72 size_t h_pitchBytes = nx * sizeof(float); 73 cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice); 74 cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice); 75 76 // 绑定纹理(两组) 77 texRefPL.normalized = 1; 78 texRefPL.filterMode = cudaFilterModePoint; 79 texRefPL.addressMode[0] = cudaAddressModeWrap; 80 texRefPL.addressMode[1] = cudaAddressModeWrap; 81 cudaBindTexture2D(0, &texRefPL, d_idataPL, &channelDesc, nx, ny, d_pitchBytes); 82 83 texRefArray.normalized = 1; 84 texRefArray.filterMode = cudaFilterModePoint; 85 texRefArray.addressMode[0] = cudaAddressModeWrap; 86 texRefArray.addressMode[1] = cudaAddressModeWrap; 87 cudaBindTextureToArray(texRefArray, d_idataArray, channelDesc); 88 89 // 理论计算结果 90 for (i = 0; i < ny; i++) 91 { 92 for (j = 0; j < nx; ++j) 93 h_ref[i * nx + j] = h_idata[(i + y_shift) % ny * nx + (j + x_shift) % nx]; 94 } 95 96 // 使用线性数组的纹理计算 97 cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny); 98 cudaEventRecord(start, 0); 99 for (int i = 0; i < NUM_REPS; ++i) 100 shiftPitchLinear << <dimGrid, dimBlock >> > (d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift); 101 cudaEventRecord(stop, 0); 102 cudaEventSynchronize(stop); 103 float timePL; 104 cudaEventElapsedTime(&timePL, start, stop); 105 106 // 检查结果 107 cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost); 108 if (!compareData(h_ref, h_odata, nx*ny, 0.0f, 0.15f)) 109 { 110 printf("\n\t ShiftPitchLinear failed\n"); 111 result = false; 112 } 113 114 // 使用 CUDA数组的纹理计算 115 cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny); 116 cudaEventRecord(start, 0); 117 for (int i = 0; i < NUM_REPS; ++i) 118 shiftArray << <dimGrid, dimBlock >> > (d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift); 119 cudaEventRecord(stop, 0); 120 cudaEventSynchronize(stop); 121 float timeArray; 122 cudaEventElapsedTime(&timeArray, start, stop); 123 124 // 检查结果 125 cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost); 126 if (!compareData(h_ref, h_odata, nx*ny, 0.0f, 0.15f)) 127 { 128 printf("\n\tShiftArray failed\n"); 129 result = false; 130 } 131 132 // 计算带宽和读取速度 133 float bandwidthPL = 2.f * nx * ny * sizeof(float) / (timePL / 1000.f / NUM_REPS * 1.e+9f); 134 float bandwidthArray = 2.f * nx * ny * sizeof(float) / (timeArray / 1000.f / NUM_REPS * 1.e+9f); 135 printf("\n\tBandwidth for pitch linear: %.2f GB/s; for array: %.2f GB/s\n", bandwidthPL, bandwidthArray); 136 float fetchRatePL = nx * ny / 1.e+6f / (timePL / 1000.0f / NUM_REPS); 137 float fetchRateArray = nx * ny / 1.e+6f / (timeArray / 1000.0f / NUM_REPS); 138 printf("\n\tTexture fetch rate for pitch linear: %.2f Mpix/s; for array: %.2f Mpix/s\n", fetchRatePL, fetchRateArray); 139 140 // 回收工作 141 free(h_idata); 142 free(h_odata); 143 free(h_ref); 144 cudaUnbindTexture(texRefPL); 145 cudaUnbindTexture(texRefArray); 146 cudaFree(d_idataPL); 147 cudaFreeArray(d_idataArray); 148 cudaFree(d_odata); 149 cudaEventDestroy(start); 150 cudaEventDestroy(stop); 151 152 return result; 153 } 154 155 int main(int argc, char **argv) 156 { 157 printf("\n\tStart\n"); 158 printf("\n\tFinished, %s\n", test() ? "Passed" : "Failed"); 159 160 getchar(); 161 return 0; 162 }
▶ 输出结果
Start Bandwidth for pitch linear: 12.58 GB/s; for array: 14.64 GB/s Texture fetch rate for pitch linear: 1573.09 Mpix/s; for array: 1829.39 Mpix/s Finished, Passed
▶ 涨姿势
● 用到的函数都在以前的,有关线性二维数组和纹理内存使用方法的博客汇总讨论过了。
● 由运行结果可知,使用二维纹理引用时,CUDA 二维数组的效率比线性二维数组更高。