使用纹理引用来旋转图片,并分别在静态编译和运行时编译(Driver API 加载 PTX)两种环境下实现。
▶ 源代码:静态编译
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

#define MAX_EPSILON_ERROR 5e-3f

// Rotation angle, in radians, passed to the kernel as `theta`.
const float angle = 0.5f;

// 2D float texture reference; main() binds it to a CUDA array holding the input image.
texture<float, 2, cudaReadModeElementType> tex;

// Rotates the image bound to `tex` by `theta` radians about its centre.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. The grid
// may be larger than the image; threads outside width x height do nothing.
// outputData must hold width * height floats (row-major).
// The texture is sampled with normalized coordinates, so main() must bind it
// with tex.normalized = true.
__global__ void transformKernel(float *outputData, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the grid is rounded up to whole blocks.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Normalized coordinates with the origin moved to the image centre.
    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;

    const float c = cosf(theta);
    const float s = sinf(theta);

    // Rotate, then shift the origin back before sampling.
    outputData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

int main()
{
    printf("\n\tStart.\n");

    // Load the input image and the reference (expected) result.
    // The sdkFindFilePath() lookup of the original sample was removed; paths are hard-coded.
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0, refWidth = 0, refHeight = 0;
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height) ||
        !sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "\n\tFailed to load input files.\n");
        free(h_data);
        free(h_dataRef);
        return 1;
    }
    // The reference is compared element by element, so both images must have the same size.
    if (width == 0 || height == 0 || refWidth != width || refHeight != height)
    {
        fprintf(stderr, "\n\tInput (%u x %u) and reference (%u x %u) sizes do not match.\n",
                width, height, refWidth, refHeight);
        free(h_data);
        free(h_dataRef);
        return 1;
    }
    const size_t size = (size_t)width * height * sizeof(float);
    printf("\n\tLoad input files, %u x %u pixels\n", width, height);

    // Allocate the output buffer and the CUDA array backing the texture.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    cudaArray *cuArray = NULL;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // Unlike simpleSurfaceWrite, the host image is copied straight into the array.
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    // Configure and bind the texture reference.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // Round the grid up so images whose size is not a multiple of the block are fully covered.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y, 1);

    // Warm-up run.
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed run.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    sdkStopTimer(&timer);
    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back, save it and compare against the reference.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
    printf("\n\tSave output file.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(h_data);    // sdkLoadPGM allocates with malloc
    free(h_dataRef);
    getchar();
    return 0;
}
▶ 输出结果
Start. Load input files, 512 x 512 pixels Cost time: 0.362788 ms, 722.58 Mpixels/sec Save output file. Finish, return Passed.
▶ 源代码:运行时编译
// simpleTexture_kernel.cu
#ifndef _SIMPLETEXTURE_KERNEL_H_
#define _SIMPLETEXTURE_KERNEL_H_

// 2D float texture reference; the host looks it up by the name "tex" and binds an array to it.
texture<float, 2, cudaReadModeElementType> tex;

// Rotates the image bound to `tex` by `theta` radians about its centre.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
// that fall outside width x height return without writing, so the grid may be
// rounded up to whole blocks.
// g_odata must hold width * height floats (row-major).
// The texture is sampled with normalized coordinates; the host must enable
// normalized coordinates on the texture reference.
// extern "C" keeps the symbol name unmangled so the host can find it by name.
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against threads beyond the image edge.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Normalized coordinates with the origin moved to the image centre.
    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;

    const float c = cosf(theta);
    const float s = sinf(theta);

    // Rotate, then shift the origin back before sampling.
    g_odata[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

#endif
1 // simpleTextureDrv.cpp 2 #include <stdio.h> 3 #include <iostream> 4 #include <helper_functions.h> 5 #include <cuda.h> 6 7 #define MAX_EPSILON_ERROR 5e-3f 8 #define PATH "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTextureDrv\\data\\" 9 using namespace std; 10 float angle = 0.5f; 11 CUmodule cuModule; 12 CUcontext cuContext; 13 14 CUfunction initCUDA() 15 { 16 CUfunction cuFunction = 0; 17 string module_path, ptx_source; 18 cuInit(0); // 初始化设备,类似于 runtime 中的函数 cudaSetDevice() 19 cuCtxCreate(&cuContext, 0, 0); // 创建上下文,后两个参数分别是标志参数和设备号 20 21 // 读取 .ptx 文件 22 module_path = PATH"simpleTexture_kernel64.ptx"; 23 FILE *fp = fopen(module_path.c_str(), "rb"); 24 fseek(fp, 0, SEEK_END); 25 int file_size = ftell(fp); 26 char *buf = new char[file_size + 1]; 27 fseek(fp, 0, SEEK_SET); 28 fread(buf, sizeof(char), file_size, fp); 29 fclose(fp); 30 buf[file_size] = '\0'; 31 ptx_source = buf; 32 delete[] buf; 33 34 if (module_path.rfind("ptx") != string::npos)// 使用的是.ptx,需要运行时编译 35 { 36 // 设定编译参数,CUjit_option 放置参数名,jitOptVals 放置参数值 37 const unsigned int jitNumOptions = 3; 38 CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; 39 void **jitOptVals = new void *[jitNumOptions]; 40 41 // 编译日志长度 42 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 43 int jitLogBufferSize = 1024; 44 jitOptVals[0] = (void *)(size_t)jitLogBufferSize; 45 46 // 编译日志内容 47 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 48 char *jitLogBuffer = new char[jitLogBufferSize]; 49 jitOptVals[1] = jitLogBuffer; 50 51 // 设定一个内核使用的寄存器数量 52 jitOptions[2] = CU_JIT_MAX_REGISTERS; 53 int jitRegCount = 32; 54 jitOptVals[2] = (void *)(size_t)jitRegCount; 55 56 // 编译模块 57 cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals); 58 //printf("\n\tPTX JIT log:\n%s\n", jitLogBuffer);// 输出编译日志 59 } 60 else// 使用的是 .cubin,不用编译(本例中不经过这个分支) 61 cuModuleLoad(&cuModule, module_path.c_str()); 62 63 // 取出编译好的模块中的函数 64 cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"); 65 return 
cuFunction;// 删掉了错误检查,如果中间某一步出错,则应该先销毁上下文再退出 66 } 67 68 int main() 69 { 70 printf("\n\tStart.\n"); 71 72 // 初始化设备,编译 PTX 73 CUfunction transform = initCUDA(); 74 75 // 读取图片数据 76 float *h_data = NULL, *h_dataRef = NULL; 77 unsigned int width, height, size; 78 sdkLoadPGM(PATH"lena_bw.pgm", &h_data, &width, &height);// 删掉了用函数 sdkFindFilePath() 查找输入文件的过程 79 size = width * height * sizeof(float); 80 sdkLoadPGM(PATH"ref_rotated.pgm", &h_dataRef, &width, &height); 81 printf("\n\tLoad input files, %d x %d pixels\n", width, height); 82 83 // 申请设备内存 84 CUdeviceptr d_data = (CUdeviceptr)NULL; 85 cuMemAlloc(&d_data, size); 86 CUarray cu_array; 87 CUDA_ARRAY_DESCRIPTOR desc; 88 desc.Format = CU_AD_FORMAT_FLOAT; 89 desc.NumChannels = 1; 90 desc.Width = width; 91 desc.Height = height; 92 cuArrayCreate(&cu_array, &desc); 93 CUDA_MEMCPY2D copyParam; 94 memset(©Param, 0, sizeof(copyParam)); 95 copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; 96 copyParam.dstArray = cu_array; 97 copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; 98 copyParam.srcHost = h_data; 99 copyParam.srcPitch = width * sizeof(float); 100 copyParam.WidthInBytes = copyParam.srcPitch; 101 copyParam.Height = height; 102 cuMemcpy2D(©Param); 103 104 // 绑定纹理引用 105 CUtexref cu_texref; 106 cuModuleGetTexRef(&cu_texref, cuModule, "tex"); 107 cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT); 108 cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP); 109 cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP); 110 cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR); 111 cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES); 112 cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1); 113 cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref); 114 115 int block_size = 8; 116 StopWatchInterface *timer = NULL; 117 118 // 两种调用 Driver API 的方式 119 if (1) 120 { 121 void *args[5] = {&d_data, &width, &height, &angle}; 122 // 预跑 123 cuLaunchKernel(transform, (width / block_size), (height / 
block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL); 124 cuCtxSynchronize(); 125 // 再跑一次测试性能 126 sdkCreateTimer(&timer); 127 sdkStartTimer(&timer); 128 cuLaunchKernel(transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL); 129 } 130 else 131 { 132 int offset = 0; 133 char argBuffer[256]; 134 // 在一个 CUdeviceptr(unsigned long long)长度的空间里写入调用参数 135 *((CUdeviceptr *)&argBuffer[offset]) = d_data; 136 offset += sizeof(d_data); 137 *((unsigned int *)&argBuffer[offset]) = width; 138 offset += sizeof(width); 139 *((unsigned int *)&argBuffer[offset]) = height; 140 offset += sizeof(height); 141 *((float *)&argBuffer[offset]) = angle; 142 offset += sizeof(angle); 143 void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END }; 144 // 预跑 145 cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0,NULL, NULL, (void **)&kernel_launch_config); 146 cuCtxSynchronize(); 147 // 再跑一次测试性能 148 sdkCreateTimer(&timer); 149 sdkStartTimer(&timer); 150 cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0, 0,NULL, (void **)&kernel_launch_config); 151 } 152 cuCtxSynchronize(); 153 sdkStopTimer(&timer); 154 printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", sdkGetTimerValue(&timer), (width *height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); 155 sdkDeleteTimer(&timer); 156 157 // 结果回收、输出和检验 158 cuMemcpyDtoH(h_data, d_data, size); 159 sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height); 160 printf("\n\tSave output file.\n"); 161 printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed"); 162 163 cuMemFree(d_data); 164 cuArrayDestroy(cu_array); 165 cuCtxDestroy(cuContext); 166 getchar(); 167 return 0; 168 }
▶ 输出结果:
Start. Load input files, 512 x 512 pixels Cost time: 0.355230 ms, 737.96 Mpixels/sec Save output file. Finish, return Passed.
▶ 涨姿势
● 一般,与 0_Simple__simpleSurfaceWrite 类似。