立方体纹理贴图
▶ 源代码。用纹理方法把元素按原顺序从 CUDA3D 数组中取出来,求个相反数放入全局内存,输出。
1 #include <stdio.h> 2 #include "cuda_runtime.h" 3 #include "device_launch_parameters.h" 4 #include <helper_functions.h> 5 #include <helper_cuda.h> 6 7 #define MIN_EPSILON_ERROR 5e-3f 8 9 texture<float, cudaTextureTypeCubemap> tex; 10 11 __global__ void transformKernel(float *g_odata, int width) 12 { 13 unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; 14 unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; 15 16 float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;// [0, width-1] 间隔 1 的坐标变换为 [-1+1/width,1-1/width] 间隔 1/width 的坐标 17 float v = ((y + 0.5f) / (float)width) * 2.f - 1.f; 18 19 float cx, cy, cz; 20 21 for (unsigned int face = 0; face < 6; face++) 22 { 23 if (face == 0)// x 正层 24 { 25 cx = 1; 26 cy = -v; 27 cz = -u; 28 } 29 else if (face == 1)// x 负层 30 { 31 cx = -1; 32 cy = -v; 33 cz = u; 34 } 35 else if (face == 2)// y 正层 36 { 37 cx = u; 38 cy = 1; 39 cz = v; 40 } 41 else if (face == 3)// y 负层 42 { 43 cx = u; 44 cy = -1; 45 cz = -v; 46 } 47 else if (face == 4)// z 正层 48 { 49 cx = u; 50 cy = -v; 51 cz = 1; 52 } 53 else if (face == 5)// z 负层 54 { 55 cx = -u; 56 cy = -v; 57 cz = -1; 58 } 59 g_odata[face*width*width + y*width + x] = - texCubemap(tex, cx, cy, cz);// 纹理数据读取到全局内存中输出 60 } 61 } 62 63 int main(int argc, char** argv) 64 { 65 unsigned int width = 64, num_faces = 6, num_layers = 1; 66 unsigned int cubemap_size = width * width * num_faces; 67 unsigned int size = cubemap_size * num_layers * sizeof(float); 68 float *h_data = (float *)malloc(size); 69 float *h_data_ref = (float *)malloc(size); // 理论输出 70 float *d_data = NULL; 71 cudaMalloc((void **)&d_data, size); 72 73 for (int i = 0; i < (int)(cubemap_size * num_layers); i++) 74 h_data[i] = (float)i; 75 for (unsigned int layer = 0; layer < num_layers; layer++) 76 { 77 for (int i = 0; i < (int)(cubemap_size); i++) 78 h_data_ref[layer*cubemap_size + i] = -h_data[layer*cubemap_size + i] + layer; 79 } 80 81 printf("\n\t\Input data.n\t"); 82 for (int i = 0; i < width * num_faces * num_layers; i++) 83 { 84 printf("%2.1f ", h_data[i]); 85 if ((i + 1) % width == 0) 86 printf("\n\t"); 87 if ((i + 1) % (width *width) == 0) 88 printf("\n\t"); 89 } 90 printf("\n\tIdeal output data\n\t"); 91 for (int i = 0; i < width * num_faces * num_layers; i++) 92 { 93 printf("%2.1f ", h_data_ref[i]); 94 if ((i + 1) % width == 0) 95 printf("\n\t"); 96 if ((i + 1) % (width *width) == 0) 97 printf("\n\t"); 98 } 99 100 // 设置 CUDA 3D 数组参数和数据拷贝 101 cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); 102 cudaArray *cu_3darray; 103 cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap); 104 cudaMemcpy3DParms myparms = { 0 }; 105 myparms.srcPos = make_cudaPos(0, 0, 0); 106 myparms.dstPos = make_cudaPos(0, 0, 0); 107 myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width); 108 myparms.dstArray = cu_3darray; 109 myparms.extent = make_cudaExtent(width, width, num_faces); 110 myparms.kind = cudaMemcpyHostToDevice; 111 cudaMemcpy3D(&myparms); 112 113 // 设置纹理参数并绑定 114 tex.addressMode[0] = cudaAddressModeWrap; 115 tex.addressMode[1] = cudaAddressModeWrap; 116 tex.filterMode = cudaFilterModeLinear; 117 tex.normalized = true; 118 cudaBindTextureToArray(tex, cu_3darray, channelDesc); 119 120 dim3 dimBlock(8, 8, 1); 121 dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1); 122 printf("\n\tCubemap data of %d * %d * %d: Grid size is %d x %d, each block has 8 x 8 threads.\n", width, width, num_layers, dimGrid.x, dimGrid.y); 123 transformKernel << < dimGrid, dimBlock >> >(d_data, width);// 预跑 124 cudaDeviceSynchronize(); 125 126 StopWatchInterface *timer = NULL;// 新的计时工具 127 sdkCreateTimer(&timer); 128 sdkStartTimer(&timer); 129 130 transformKernel << < dimGrid, dimBlock, 0 >> >(d_data, width); 131 cudaDeviceSynchronize(); 132 133 sdkStopTimer(&timer); 134 printf("\n\Time: %.3f msec, %.2f Mtexlookups/sec\n", sdkGetTimerValue(&timer), (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6)); 135 sdkDeleteTimer(&timer); 136 137 // 返回计算结果并检验 138 memset(h_data, 0, size); 139 cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost); 140 if (checkCmdLineFlag(argc, (const char **)argv, "regression")) 141 sdkWriteFile<float>("./data/regression.dat", h_data, width * width, 0.0f, false); 142 else 143 printf("Comparing kernel output to expected data return %d\n", compareData(h_data, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f)); 144 145 printf("\n\tActual output data\n\t"); 146 for (int i = 0; i < width * num_faces * num_layers; i++) 147 { 148 printf("%2.1f ", h_data[i]); 149 if ((i + 1) % width == 0) 150 printf("\n\t"); 151 if ((i + 1) % (width * width) == 0) 152 printf("\n\t"); 153 } 154 155 free(h_data); 156 free(h_data_ref); 157 cudaFree(d_data); 158 cudaFreeArray(cu_3darray); 159 160 getchar(); 161 return 0; 162 }
▶ 输出结果
Input data.n 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 24.0 25.0 26.0 27.0 28.0 29.0 30.0 31.0 32.0 33.0 34.0 35.0 36.0 37.0 38.0 39.0 40.0 41.0 42.0 43.0 44.0 45.0 46.0 47.0 48.0 49.0 50.0 51.0 52.0 53.0 54.0 55.0 56.0 57.0 58.0 59.0 60.0 61.0 62.0 63.0 64.0 65.0 66.0 67.0 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0 78.0 79.0 80.0 81.0 82.0 83.0 84.0 85.0 86.0 87.0 88.0 89.0 90.0 91.0 92.0 93.0 94.0 95.0 96.0 97.0 98.0 99.0 100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 111.0 112.0 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 122.0 123.0 124.0 125.0 126.0 127.0 128.0 129.0 130.0 131.0 132.0 133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0 144.0 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 155.0 156.0 157.0 158.0 159.0 160.0 161.0 162.0 163.0 164.0 165.0 166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0 176.0 177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 188.0 189.0 190.0 191.0 192.0 193.0 194.0 195.0 196.0 197.0 198.0 199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0 208.0 209.0 210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 221.0 222.0 223.0 224.0 225.0 226.0 227.0 228.0 229.0 230.0 231.0 232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0 240.0 241.0 242.0 243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 254.0 255.0 256.0 257.0 258.0 259.0 260.0 261.0 262.0 263.0 264.0 265.0 266.0 267.0 268.0 269.0 270.0 271.0 272.0 273.0 274.0 275.0 276.0 277.0 278.0 279.0 280.0 281.0 282.0 283.0 284.0 285.0 286.0 287.0 288.0 289.0 290.0 291.0 292.0 293.0 294.0 295.0 296.0 297.0 298.0 299.0 300.0 301.0 302.0 303.0 304.0 305.0 306.0 307.0 308.0 309.0 310.0 311.0 312.0 313.0 314.0 315.0 316.0 317.0 318.0 319.0 320.0 321.0 322.0 323.0 324.0 325.0 326.0 327.0 328.0 329.0 330.0 331.0 332.0 333.0 334.0 335.0 336.0 337.0 338.0 339.0 340.0 341.0 342.0 343.0 344.0 345.0 346.0 347.0 348.0 349.0 350.0 351.0 352.0 353.0 354.0 355.0 356.0 357.0 358.0 359.0 360.0 361.0 362.0 363.0 364.0 365.0 366.0 367.0 368.0 369.0 370.0 371.0 372.0 373.0 374.0 375.0 376.0 377.0 378.0 379.0 380.0 381.0 382.0 383.0 Ideal output data 0.0 -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 -25.0 -26.0 -27.0 -28.0 -29.0 -30.0 -31.0 -32.0 -33.0 -34.0 -35.0 -36.0 -37.0 -38.0 -39.0 -40.0 -41.0 -42.0 -43.0 -44.0 -45.0 -46.0 -47.0 -48.0 -49.0 -50.0 -51.0 -52.0 -53.0 -54.0 -55.0 -56.0 -57.0 -58.0 -59.0 -60.0 -61.0 -62.0 -63.0 -64.0 -65.0 -66.0 -67.0 -68.0 -69.0 -70.0 -71.0 -72.0 -73.0 -74.0 -75.0 -76.0 -77.0 -78.0 -79.0 -80.0 -81.0 -82.0 -83.0 -84.0 -85.0 -86.0 -87.0 -88.0 -89.0 -90.0 -91.0 -92.0 -93.0 -94.0 -95.0 -96.0 -97.0 -98.0 -99.0 -100.0 -101.0 -102.0 -103.0 -104.0 -105.0 -106.0 -107.0 -108.0 -109.0 -110.0 -111.0 -112.0 -113.0 -114.0 -115.0 -116.0 -117.0 -118.0 -119.0 -120.0 -121.0 -122.0 -123.0 -124.0 -125.0 -126.0 -127.0 -128.0 -129.0 -130.0 -131.0 -132.0 -133.0 -134.0 -135.0 -136.0 -137.0 -138.0 -139.0 -140.0 -141.0 -142.0 -143.0 -144.0 -145.0 -146.0 -147.0 -148.0 -149.0 -150.0 -151.0 -152.0 -153.0 -154.0 -155.0 -156.0 -157.0 -158.0 -159.0 -160.0 -161.0 -162.0 -163.0 -164.0 -165.0 -166.0 -167.0 -168.0 -169.0 -170.0 -171.0 -172.0 -173.0 -174.0 -175.0 -176.0 -177.0 -178.0 -179.0 -180.0 -181.0 -182.0 -183.0 -184.0 -185.0 -186.0 -187.0 -188.0 -189.0 -190.0 -191.0 -192.0 -193.0 -194.0 -195.0 -196.0 -197.0 -198.0 -199.0 -200.0 -201.0 -202.0 -203.0 -204.0 -205.0 -206.0 -207.0 -208.0 -209.0 -210.0 -211.0 -212.0 -213.0 -214.0 -215.0 -216.0 -217.0 -218.0 -219.0 -220.0 -221.0 -222.0 -223.0 -224.0 -225.0 -226.0 -227.0 -228.0 -229.0 -230.0 -231.0 -232.0 -233.0 -234.0 -235.0 -236.0 -237.0 -238.0 -239.0 -240.0 -241.0 -242.0 -243.0 -244.0 -245.0 -246.0 -247.0 -248.0 -249.0 -250.0 -251.0 -252.0 -253.0 -254.0 -255.0 -256.0 -257.0 -258.0 -259.0 -260.0 -261.0 -262.0 -263.0 -264.0 -265.0 -266.0 -267.0 -268.0 -269.0 -270.0 -271.0 -272.0 -273.0 -274.0 -275.0 -276.0 -277.0 -278.0 -279.0 -280.0 -281.0 -282.0 -283.0 -284.0 -285.0 -286.0 -287.0 -288.0 -289.0 -290.0 -291.0 -292.0 -293.0 -294.0 -295.0 -296.0 -297.0 -298.0 -299.0 -300.0 -301.0 -302.0 -303.0 -304.0 -305.0 -306.0 -307.0 -308.0 -309.0 -310.0 -311.0 -312.0 -313.0 -314.0 -315.0 -316.0 -317.0 -318.0 -319.0 -320.0 -321.0 -322.0 -323.0 -324.0 -325.0 -326.0 -327.0 -328.0 -329.0 -330.0 -331.0 -332.0 -333.0 -334.0 -335.0 -336.0 -337.0 -338.0 -339.0 -340.0 -341.0 -342.0 -343.0 -344.0 -345.0 -346.0 -347.0 -348.0 -349.0 -350.0 -351.0 -352.0 -353.0 -354.0 -355.0 -356.0 -357.0 -358.0 -359.0 -360.0 -361.0 -362.0 -363.0 -364.0 -365.0 -366.0 -367.0 -368.0 -369.0 -370.0 -371.0 -372.0 -373.0 -374.0 -375.0 -376.0 -377.0 -378.0 -379.0 -380.0 -381.0 -382.0 -383.0 Cubemap data of 64 * 64 * 1: Grid size is 8 x 8, each block has 8 x 8 threads. Time: 0.098 msec, 249.50 Mtexlookups/sec Comparing kernel output to expected data return 1 Actual output data -0.0 -1.0 -2.0 -3.0 -4.0 -5.0 -6.0 -7.0 -8.0 -9.0 -10.0 -11.0 -12.0 -13.0 -14.0 -15.0 -16.0 -17.0 -18.0 -19.0 -20.0 -21.0 -22.0 -23.0 -24.0 -25.0 -26.0 -27.0 -28.0 -29.0 -30.0 -31.0 -32.0 -33.0 -34.0 -35.0 -36.0 -37.0 -38.0 -39.0 -40.0 -41.0 -42.0 -43.0 -44.0 -45.0 -46.0 -47.0 -48.0 -49.0 -50.0 -51.0 -52.0 -53.0 -54.0 -55.0 -56.0 -57.0 -58.0 -59.0 -60.0 -61.0 -62.0 -63.0 -64.0 -65.0 -66.0 -67.0 -68.0 -69.0 -70.0 -71.0 -72.0 -73.0 -74.0 -75.0 -76.0 -77.0 -78.0 -79.0 -80.0 -81.0 -82.0 -83.0 -84.0 -85.0 -86.0 -87.0 -88.0 -89.0 -90.0 -91.0 -92.0 -93.0 -94.0 -95.0 -96.0 -97.0 -98.0 -99.0 -100.0 -101.0 -102.0 -103.0 -104.0 -105.0 -106.0 -107.0 -108.0 -109.0 -110.0 -111.0 -112.0 -113.0 -114.0 -115.0 -116.0 -117.0 -118.0 -119.0 -120.0 -121.0 -122.0 -123.0 -124.0 -125.0 -126.0 -127.0 -128.0 -129.0 -130.0 -131.0 -132.0 -133.0 -134.0 -135.0 -136.0 -137.0 -138.0 -139.0 -140.0 -141.0 -142.0 -143.0 -144.0 -145.0 -146.0 -147.0 -148.0 -149.0 -150.0 -151.0 -152.0 -153.0 -154.0 -155.0 -156.0 -157.0 -158.0 -159.0 -160.0 -161.0 -162.0 -163.0 -164.0 -165.0 -166.0 -167.0 -168.0 -169.0 -170.0 -171.0 -172.0 -173.0 -174.0 -175.0 -176.0 -177.0 -178.0 -179.0 -180.0 -181.0 -182.0 -183.0 -184.0 -185.0 -186.0 -187.0 -188.0 -189.0 -190.0 -191.0 -192.0 -193.0 -194.0 -195.0 -196.0 -197.0 -198.0 -199.0 -200.0 -201.0 -202.0 -203.0 -204.0 -205.0 -206.0 -207.0 -208.0 -209.0 -210.0 -211.0 -212.0 -213.0 -214.0 -215.0 -216.0 -217.0 -218.0 -219.0 -220.0 -221.0 -222.0 -223.0 -224.0 -225.0 -226.0 -227.0 -228.0 -229.0 -230.0 -231.0 -232.0 -233.0 -234.0 -235.0 -236.0 -237.0 -238.0 -239.0 -240.0 -241.0 -242.0 -243.0 -244.0 -245.0 -246.0 -247.0 -248.0 -249.0 -250.0 -251.0 -252.0 -253.0 -254.0 -255.0 -256.0 -257.0 -258.0 -259.0 -260.0 -261.0 -262.0 -263.0 -264.0 -265.0 -266.0 -267.0 -268.0 -269.0 -270.0 -271.0 -272.0 -273.0 -274.0 -275.0 -276.0 -277.0 -278.0 -279.0 -280.0 -281.0 -282.0 -283.0 -284.0 -285.0 -286.0 -287.0 -288.0 -289.0 -290.0 -291.0 -292.0 -293.0 -294.0 -295.0 -296.0 -297.0 -298.0 -299.0 -300.0 -301.0 -302.0 -303.0 -304.0 -305.0 -306.0 -307.0 -308.0 -309.0 -310.0 -311.0 -312.0 -313.0 -314.0 -315.0 -316.0 -317.0 -318.0 -319.0 -320.0 -321.0 -322.0 -323.0 -324.0 -325.0 -326.0 -327.0 -328.0 -329.0 -330.0 -331.0 -332.0 -333.0 -334.0 -335.0 -336.0 -337.0 -338.0 -339.0 -340.0 -341.0 -342.0 -343.0 -344.0 -345.0 -346.0 -347.0 -348.0 -349.0 -350.0 -351.0 -352.0 -353.0 -354.0 -355.0 -356.0 -357.0 -358.0 -359.0 -360.0 -361.0 -362.0 -363.0 -364.0 -365.0 -366.0 -367.0 -368.0 -369.0 -370.0 -371.0 -372.0 -373.0 -374.0 -375.0 -376.0 -377.0 -378.0 -379.0 -380.0 -381.0 -382.0 -383.0
▶ 涨姿势
● helper_time.h 中新定义的计时函数
1 // 关键步骤 2 StopWatchInterface *timer = NULL; 3 sdkCreateTimer(&timer); 4 sdkStartTimer(&timer); 5 6 sdkStopTimer(&timer); 7 sdkGetTimerValue(&timer); 8 sdkDeleteTimer(&timer); 9 10 // helper_time.h 11 class StopWatchInterface 12 { 13 public: 14 StopWatchInterface() {}; 15 virtual ~StopWatchInterface() {}; 16 17 public: 18 virtual void start() = 0; 19 virtual void stop() = 0; 20 virtual void reset() = 0; 21 virtual float getTime() = 0;// 获取计时(计时器不停) 22 virtual float getAverageTime() = 0; 23 }; 24 25 inline bool sdkCreateTimer(StopWatchInterface **timer_interface) 26 { 27 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 28 *timer_interface = (StopWatchInterface *)new StopWatchWin(); 29 #else 30 *timer_interface = (StopWatchInterface *)new StopWatchLinux(); 31 #endif 32 return (*timer_interface != NULL) ? true : false; 33 } 34 35 inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) 36 { 37 if (*timer_interface) 38 { 39 delete *timer_interface; 40 *timer_interface = NULL; 41 } 42 return true; 43 } 44 45 inline bool sdkStartTimer(StopWatchInterface **timer_interface) 46 { 47 if (*timer_interface) 48 (*timer_interface)->start(); 49 return true; 50 } 51 52 inline bool sdkStopTimer(StopWatchInterface **timer_interface) 53 { 54 if (*timer_interface) 55 (*timer_interface)->stop(); 56 return true; 57 } 58 59 inline float sdkGetTimerValue(StopWatchInterface **timer_interface) 60 { 61 if (*timer_interface) 62 return (*timer_interface)->getTime(); 63 else 64 return 0.0f; 65 }
● 立方体纹理贴图。六个面分别为 x = 1 正面、x = -1 轴负面、y = 1 正面、y = -1 负面、z = 1 正面、x = -1 负面,对应前、后、右、左、上、下。按照线性下标 [0, width * width * 6 - 1] 顺序访问时,各元素存储位置如下图所示(width == 2 为例)。