爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

对比设备线性二维数组和 CUDA 二维数组在纹理引用中的效率

▶ 源代码。分别绑定相同大小的设备线性二维数组和 CUDA 二维数组为纹理引用,做简单的平移操作,重复若干次计算带宽和访问速度。

  1 #include <stdio.h>
  2 #ifdef _WIN32
  3 #  define WINDOWS_LEAN_AND_MEAN
  4 #  define NOMINMAX
  5 #  include <windows.h>
  6 #endif
  7 #include <cuda_runtime.h>
  8 #include "device_launch_parameters.h"
  9 #include <helper_functions.h>
 10 #include <helper_cuda.h>
 11 
 12 #define NUM_REPS 100  // test 重复次数
 13 #define TILE_DIM 16   // 线程块尺寸
 14 
 15 texture<float, 2, cudaReadModeElementType> texRefPL;
 16 texture<float, 2, cudaReadModeElementType> texRefArray;
 17 
 18 __global__ void shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY)
 19 {
 20     int xid = blockIdx.x * blockDim.x + threadIdx.x;
 21     int yid = blockIdx.y * blockDim.y + threadIdx.y;
 22 
 23     odata[yid * pitch + xid] = tex2D(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 24 }
 25 
 26 __global__ void shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY)
 27 {
 28     int xid = blockIdx.x * blockDim.x + threadIdx.x;
 29     int yid = blockIdx.y * blockDim.y + threadIdx.y;
 30 
 31     odata[yid * pitch + xid] = tex2D(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 32 }
 33 
 34 bool test()
 35 {
 36     bool result = true;
 37     int i, j, ishift, jshift;
 38     // 数组大小以及 x,y 方向上的偏移量
 39     const int nx = 2048;
 40     const int ny = 2048;
 41     const int x_shift = 5;
 42     const int y_shift = 7;
 43     if ((nx % TILE_DIM) || (ny % TILE_DIM))
 44     {
 45         printf("nx and ny must be multiples of TILE_DIM\n");
 46         return EXIT_FAILURE;
 47     }
 48     dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);
 49 
 50     cudaEvent_t start, stop;
 51     cudaEventCreate(&start);
 52     cudaEventCreate(&stop);
 53     
 54     //int devID = findCudaDevice(argc, (const char **)argv);// 使用device 0,不再使用命令行参数进行判断
 55     
 56     // 申请内存
 57     float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
 58     float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
 59     float *h_ref = (float *)malloc(sizeof(float) * nx * ny);
 60     for (int i = 0; i < nx * ny; ++i)
 61         h_idata[i] = (float)i;
 62     float *d_idataPL;
 63     size_t d_pitchBytes;
 64     cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny);
 65     cudaArray *d_idataArray;
 66     cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 67     cudaMallocArray(&d_idataArray, &channelDesc, nx, ny);
 68     float *d_odata;
 69     cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny);
 70 
 71     // 拷贝内存(两组)
 72     size_t h_pitchBytes = nx * sizeof(float);
 73     cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice);
 74     cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice);
 75 
 76     // 绑定纹理(两组)
 77     texRefPL.normalized = 1;
 78     texRefPL.filterMode = cudaFilterModePoint;
 79     texRefPL.addressMode[0] = cudaAddressModeWrap;
 80     texRefPL.addressMode[1] = cudaAddressModeWrap;
 81     cudaBindTexture2D(0, &texRefPL, d_idataPL, &channelDesc, nx, ny, d_pitchBytes);
 82 
 83     texRefArray.normalized = 1;
 84     texRefArray.filterMode = cudaFilterModePoint;
 85     texRefArray.addressMode[0] = cudaAddressModeWrap;
 86     texRefArray.addressMode[1] = cudaAddressModeWrap;
 87     cudaBindTextureToArray(texRefArray, d_idataArray, channelDesc);
 88 
 89     // 理论计算结果
 90     for (i = 0; i < ny; i++)
 91     {
 92         for (j = 0; j < nx; ++j)
 93             h_ref[i * nx + j] = h_idata[(i + y_shift) % ny * nx + (j + x_shift) % nx];
 94     }
 95 
 96     // 使用线性数组的纹理计算
 97     cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny);
 98     cudaEventRecord(start, 0);
 99     for (int i = 0; i < NUM_REPS; ++i)
100         shiftPitchLinear << <dimGrid, dimBlock >> > (d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift);
101     cudaEventRecord(stop, 0);
102     cudaEventSynchronize(stop);
103     float timePL;
104     cudaEventElapsedTime(&timePL, start, stop);
105 
106     // 检查结果
107     cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost);
108     if (!compareData(h_ref, h_odata, nx*ny, 0.0f, 0.15f))
109     {
110         printf("\n\t ShiftPitchLinear failed\n");
111         result = false;
112     }
113 
114     // 使用 CUDA数组的纹理计算
115     cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny);
116     cudaEventRecord(start, 0);
117     for (int i = 0; i < NUM_REPS; ++i)
118         shiftArray << <dimGrid, dimBlock >> > (d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift);
119     cudaEventRecord(stop, 0);
120     cudaEventSynchronize(stop);
121     float timeArray;
122     cudaEventElapsedTime(&timeArray, start, stop);
123 
124     // 检查结果
125     cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost);
126     if (!compareData(h_ref, h_odata, nx*ny, 0.0f, 0.15f))
127     {
128         printf("\n\tShiftArray failed\n");
129         result = false;
130     }
131 
132     // 计算带宽和读取速度
133     float bandwidthPL = 2.f * nx * ny * sizeof(float) / (timePL / 1000.f / NUM_REPS * 1.e+9f);
134     float bandwidthArray = 2.f * nx * ny * sizeof(float) / (timeArray / 1000.f / NUM_REPS * 1.e+9f);
135     printf("\n\tBandwidth for pitch linear: %.2f GB/s; for array: %.2f GB/s\n", bandwidthPL, bandwidthArray);
136     float fetchRatePL = nx * ny / 1.e+6f / (timePL / 1000.0f / NUM_REPS); 
137     float fetchRateArray = nx * ny / 1.e+6f / (timeArray / 1000.0f / NUM_REPS); 
138     printf("\n\tTexture fetch rate for pitch linear: %.2f Mpix/s; for array: %.2f Mpix/s\n", fetchRatePL, fetchRateArray);
139 
140     // 回收工作
141     free(h_idata);
142     free(h_odata);
143     free(h_ref);
144     cudaUnbindTexture(texRefPL);
145     cudaUnbindTexture(texRefArray);
146     cudaFree(d_idataPL);
147     cudaFreeArray(d_idataArray);
148     cudaFree(d_odata);
149     cudaEventDestroy(start);
150     cudaEventDestroy(stop);
151 
152     return result;
153 }
154 
155 int main(int argc, char **argv)
156 {
157     printf("\n\tStart\n");
158     printf("\n\tFinished, %s\n", test() ? "Passed" : "Failed");
159 
160     getchar();
161     return 0;
162 }

▶ 输出结果

    Start

    Bandwidth for pitch linear: 12.58 GB/s; for array: 14.64 GB/s

    Texture fetch rate for pitch linear: 1573.09 Mpix/s; for array: 1829.39 Mpix/s

    Finished, Passed

▶ 涨姿势

● 用到的函数都在以前的,有关线性二维数组和纹理内存使用方法的博客汇总讨论过了。

● 由运行结果可知,使用二维纹理引用时,CUDA 二维数组的效率比线性二维数组更高。

posted on 2017-11-25 13:06  爨爨爨好  阅读(381)  评论(0编辑  收藏  举报