爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

使用纹理引用来旋转图片,并分别在静态编译和运行时编译两种环境下实现。

▶ 源代码:静态编译

 1 #include <stdio.h>
 2 #include <windows.h>
 3 #include <cuda_runtime.h>
 4 #include "device_launch_parameters.h"
 5 #include <helper_functions.h>
 6 #include <helper_cuda.h>
 7 
 8 #define MAX_EPSILON_ERROR 5e-3f  // epsilon argument passed to compareData() when checking the result against the reference image
 9 const float angle = 0.5f;  // rotation angle passed to transformKernel as theta (fed to cosf/sinf, i.e. radians)
10 texture<float, 2, cudaReadModeElementType> tex;  // file-scope 2D float texture reference sampled by transformKernel and configured/bound in main()
11 
// Rotates the image bound to the texture reference `tex` by `theta` radians
// about the image centre and stores the result in outputData.
//
// Parameters:
//   outputData  device buffer of at least width * height floats, row-major
//   width       output width in pixels
//   height      output height in pixels
//   theta       rotation angle in radians
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
// whose (x, y) falls outside width x height return without writing, so the
// grid may safely be rounded up past the image size.
//
// Texture coordinates are expressed as fractions of the image size
// (x / width, y / height), so `tex` must be set up for normalized coordinates.
__global__ void transformKernel(float *outputData, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against out-of-range threads (and non-positive dimensions).
    if (width <= 0 || height <= 0 || x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Shift the origin to the image centre before rotating.
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // Evaluate the trigonometric terms once instead of twice each.
    const float cosTheta = cosf(theta);
    const float sinTheta = sinf(theta);

    // Rotate, then shift the origin back to the corner.
    const float tu = u * cosTheta - v * sinTheta + 0.5f;
    const float tv = v * cosTheta + u * sinTheta + 0.5f;

    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    outputData[(size_t)y * width + x] = tex2D(tex, tu, tv);
}
21 
// Host driver for the statically compiled (runtime API) version.
//
// Loads the input image and a reference image, copies the input into a CUDA
// array bound to the texture reference `tex`, runs transformKernel once as a
// warm-up and once timed, saves the rotated image and compares it against the
// reference. Returns 0 after a completed run and 1 if an input file cannot be
// used; CUDA failures are reported through checkCudaErrors()/getLastCudaError()
// from helper_cuda.h.
int main()
{
    printf("\n\tStart.\n");

    // Load the images from fixed paths (the SDK sample's sdkFindFilePath()
    // lookup was removed).
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0, refWidth = 0, refHeight = 0;
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height) || h_data == NULL)
    {
        fprintf(stderr, "\n\tFailed to load the input image.\n");
        return 1;
    }
    // The reference gets its own dimensions so it cannot silently overwrite
    // the size the device buffers are derived from.
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight) || h_dataRef == NULL)
    {
        fprintf(stderr, "\n\tFailed to load the reference image.\n");
        free(h_data);
        return 1;
    }
    if (refWidth != width || refHeight != height)
    {
        fprintf(stderr, "\n\tReference image is %u x %u but input is %u x %u.\n", refWidth, refHeight, width, height);
        free(h_data);
        free(h_dataRef);
        return 1;
    }
    const size_t size = (size_t)width * height * sizeof(float);
    printf("\n\tLoad input files, %u x %u pixels\n", width, height);

    // Allocate the output buffer and the CUDA array backing the texture.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    // Zero the output so pixels not covered by the grid (see below) are
    // deterministic instead of uninitialized.
    checkCudaErrors(cudaMemset(d_data, 0, size));
    cudaArray *cuArray = NULL;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    // Copy the host image straight into the array (unlike simpleSurfaceWrite).
    const size_t rowBytes = width * sizeof(float);
    checkCudaErrors(cudaMemcpy2DToArray(cuArray, 0, 0, h_data, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

    // Configure and bind the texture reference.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // NOTE: the grid is rounded down, so when width or height is not a
    // multiple of the block size the right/bottom remainder is not processed.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    // Warm-up launch.
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel warm-up launch failed");
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed launch; errors are inspected only after the timer has stopped.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    cudaError_t syncStatus = cudaDeviceSynchronize();

    sdkStopTimer(&timer);
    getLastCudaError("transformKernel timed launch failed");
    checkCudaErrors(syncStatus);
    const float elapsedMs = sdkGetTimerValue(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Copy the result back, save it and compare against the reference.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
    printf("\n\tSave output file.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(h_data);
    free(h_dataRef);
    getchar();
    return 0;
}

▶ 输出结果

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.362788 ms, 722.58 Mpixels/sec

    Save output file.

    Finish, return Passed.

▶ 源代码:运行时编译

 1 // simpleTexture_kernel.cu
 2 #ifndef _SIMPLETEXTURE_KERNEL_H_
 3 #define _SIMPLETEXTURE_KERNEL_H_
 4 
 5 texture<float, 2, cudaReadModeElementType> tex;  // 2D float texture reference sampled by transformKernel; the host looks it up in the module by the name "tex"
 6 
 // Rotates the image bound to the texture reference `tex` by `theta` radians
 // about the image centre and stores the result in g_odata.
 //
 // Declared extern "C" so the symbol keeps the unmangled name
 // "transformKernel" for lookup by name from the host.
 //
 // Parameters:
 //   g_odata  device buffer of at least width * height floats, row-major
 //   width    output width in pixels
 //   height   output height in pixels
 //   theta    rotation angle in radians
 //
 // Launch layout: 2D grid of 2D blocks, one thread per output pixel. Threads
 // whose (x, y) falls outside width x height return without writing, so the
 // grid may safely be rounded up past the image size.
 //
 // Texture coordinates are fractions of the image size (x / width,
 // y / height), so `tex` must be set up for normalized coordinates.
 extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta)
 {
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // Guard against out-of-range threads (and non-positive dimensions).
     if (width <= 0 || height <= 0 || x >= (unsigned int)width || y >= (unsigned int)height)
         return;

     // Shift the origin to the image centre before rotating.
     float u = x / (float)width - 0.5f;
     float v = y / (float)height - 0.5f;

     // Evaluate the trigonometric terms once instead of twice each.
     const float cosTheta = cosf(theta);
     const float sinTheta = sinf(theta);

     // Rotate, then shift the origin back to the corner.
     const float tu = u * cosTheta - v * sinTheta + 0.5f;
     const float tv = v * cosTheta + u * sinTheta + 0.5f;

     // Widen before multiplying so the flat index cannot wrap in 32 bits.
     g_odata[(size_t)y * width + x] = tex2D(tex, tu, tv);
 }
16 
17 #endif
  1 // simpleTextureDrv.cpp
  2 #include <stdio.h>
  3 #include <iostream>
  4 #include <helper_functions.h>
  5 #include <cuda.h>
  6 
  7 #define MAX_EPSILON_ERROR 5e-3f  // epsilon argument passed to compareData() when checking the result against the reference image
  8 #define PATH "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTextureDrv\\data\\"  // directory prefix concatenated with the PTX and image file names
  9 using namespace std;
 10 float angle = 0.5f;  // rotation angle passed to the kernel as theta; non-const because main() takes its address for the launch arguments
 11 CUmodule cuModule;  // module loaded by initCUDA(); main() also uses it to look up the texture reference
 12 CUcontext cuContext;  // context created by initCUDA() and destroyed at the end of main()
 13 
 14 CUfunction initCUDA()
 15 {
 16     CUfunction cuFunction = 0;
 17     string module_path, ptx_source;
 18     cuInit(0);                      // 初始化设备,类似于 runtime 中的函数 cudaSetDevice()
 19     cuCtxCreate(&cuContext, 0, 0);  // 创建上下文,后两个参数分别是标志参数和设备号
 20 
 21     // 读取 .ptx 文件
 22     module_path = PATH"simpleTexture_kernel64.ptx";
 23     FILE *fp = fopen(module_path.c_str(), "rb");
 24     fseek(fp, 0, SEEK_END);
 25     int file_size = ftell(fp);
 26     char *buf = new char[file_size + 1];
 27     fseek(fp, 0, SEEK_SET);
 28     fread(buf, sizeof(char), file_size, fp);
 29     fclose(fp);
 30     buf[file_size] = '\0';
 31     ptx_source = buf;
 32     delete[] buf;
 33 
 34     if (module_path.rfind("ptx") != string::npos)// 使用的是.ptx,需要运行时编译
 35     {
 36         // 设定编译参数,CUjit_option 放置参数名,jitOptVals 放置参数值
 37         const unsigned int jitNumOptions = 3;
 38         CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
 39         void **jitOptVals = new void *[jitNumOptions];
 40 
 41         // 编译日志长度
 42         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 43         int jitLogBufferSize = 1024;
 44         jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
 45 
 46         // 编译日志内容
 47         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
 48         char *jitLogBuffer = new char[jitLogBufferSize];
 49         jitOptVals[1] = jitLogBuffer;
 50 
 51         // 设定一个内核使用的寄存器数量
 52         jitOptions[2] = CU_JIT_MAX_REGISTERS;
 53         int jitRegCount = 32;
 54         jitOptVals[2] = (void *)(size_t)jitRegCount;
 55 
 56         // 编译模块
 57         cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
 58         //printf("\n\tPTX JIT log:\n%s\n", jitLogBuffer);// 输出编译日志
 59     }
 60     else// 使用的是 .cubin,不用编译(本例中不经过这个分支)
 61         cuModuleLoad(&cuModule, module_path.c_str());
 62     
 63     // 取出编译好的模块中的函数
 64     cuModuleGetFunction(&cuFunction, cuModule, "transformKernel");
 65     return cuFunction;// 删掉了错误检查,如果中间某一步出错,则应该先销毁上下文再退出
 66 }
 67 
 68 int main()
 69 {
 70     printf("\n\tStart.\n"); 
 71 
 72     // 初始化设备,编译 PTX
 73     CUfunction transform = initCUDA();
 74 
 75     // 读取图片数据
 76     float *h_data = NULL, *h_dataRef = NULL;
 77     unsigned int width, height, size;
 78     sdkLoadPGM(PATH"lena_bw.pgm", &h_data, &width, &height);// 删掉了用函数 sdkFindFilePath() 查找输入文件的过程
 79     size = width * height * sizeof(float);
 80     sdkLoadPGM(PATH"ref_rotated.pgm", &h_dataRef, &width, &height);
 81     printf("\n\tLoad input files, %d x %d pixels\n", width, height);
 82 
 83     // 申请设备内存
 84     CUdeviceptr d_data = (CUdeviceptr)NULL;
 85     cuMemAlloc(&d_data, size);
 86     CUarray cu_array;
 87     CUDA_ARRAY_DESCRIPTOR desc;
 88     desc.Format = CU_AD_FORMAT_FLOAT;
 89     desc.NumChannels = 1;
 90     desc.Width = width;
 91     desc.Height = height;
 92     cuArrayCreate(&cu_array, &desc);
 93     CUDA_MEMCPY2D copyParam;
 94     memset(&copyParam, 0, sizeof(copyParam));
 95     copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
 96     copyParam.dstArray = cu_array;
 97     copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
 98     copyParam.srcHost = h_data;
 99     copyParam.srcPitch = width * sizeof(float);
100     copyParam.WidthInBytes = copyParam.srcPitch;
101     copyParam.Height = height;
102     cuMemcpy2D(&copyParam);
103 
104     // 绑定纹理引用
105     CUtexref cu_texref;
106     cuModuleGetTexRef(&cu_texref, cuModule, "tex");
107     cuTexRefSetArray(cu_texref, cu_array, CU_TRSA_OVERRIDE_FORMAT);
108     cuTexRefSetAddressMode(cu_texref, 0, CU_TR_ADDRESS_MODE_WRAP);
109     cuTexRefSetAddressMode(cu_texref, 1, CU_TR_ADDRESS_MODE_WRAP);
110     cuTexRefSetFilterMode(cu_texref, CU_TR_FILTER_MODE_LINEAR);
111     cuTexRefSetFlags(cu_texref, CU_TRSF_NORMALIZED_COORDINATES);
112     cuTexRefSetFormat(cu_texref, CU_AD_FORMAT_FLOAT, 1);
113     cuParamSetTexRef(transform, CU_PARAM_TR_DEFAULT, cu_texref);
114 
115     int block_size = 8;
116     StopWatchInterface *timer = NULL;
117 
118     // 两种调用 Driver API 的方式
119     if (1)
120     {
121         void *args[5] = {&d_data, &width, &height, &angle};
122         // 预跑
123         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL);
124         cuCtxSynchronize();
125         // 再跑一次测试性能
126         sdkCreateTimer(&timer);
127         sdkStartTimer(&timer);        
128         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL);
129     }
130     else
131     {
132         int offset = 0;
133         char argBuffer[256];
134         // 在一个 CUdeviceptr(unsigned long long)长度的空间里写入调用参数
135         *((CUdeviceptr *)&argBuffer[offset]) = d_data;
136         offset += sizeof(d_data);
137         *((unsigned int *)&argBuffer[offset]) = width;
138         offset += sizeof(width);
139         *((unsigned int *)&argBuffer[offset]) = height;
140         offset += sizeof(height);
141         *((float *)&argBuffer[offset]) = angle;
142         offset += sizeof(angle);
143         void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END };
144         // 预跑
145         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0,NULL, NULL, (void **)&kernel_launch_config);
146         cuCtxSynchronize();
147         // 再跑一次测试性能
148         sdkCreateTimer(&timer);
149         sdkStartTimer(&timer);        
150         cuLaunchKernel(transform, (width / block_size), (height / block_size), 1,block_size, block_size, 1,0, 0,NULL, (void **)&kernel_launch_config);
151     }
152     cuCtxSynchronize();
153     sdkStopTimer(&timer);
154     printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", sdkGetTimerValue(&timer), (width *height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
155     sdkDeleteTimer(&timer);
156 
157     // 结果回收、输出和检验
158     cuMemcpyDtoH(h_data, d_data, size);
159     sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
160     printf("\n\tSave output file.\n");
161     printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MAX_EPSILON_ERROR, 0.15f) ? "Passed" : "Failed");
162 
163     cuMemFree(d_data);
164     cuArrayDestroy(cu_array);
165     cuCtxDestroy(cuContext);
166     getchar();
167     return 0;
168 }

▶ 输出结果:

    Start.

    Load input files, 512 x 512 pixels

    Cost time: 0.355230 ms, 737.96 Mpixels/sec

    Save output file.

    Finish, return Passed.

 

▶ 涨姿势

● 一般,与 0_Simple__simpleSurfaceWrite 类似。

 

posted on 2017-12-03 01:16  爨爨爨好  阅读(437)  评论(0)  编辑  收藏  举报