使用表面写入函数,结合纹理引用实现图片的旋转
▶ 源代码
// Rotates a grayscale image on the GPU.
//
// Pipeline:
//   1. The host image is copied to linear device memory (d_data).
//   2. surfaceWriteKernel copies d_data into a CUDA array through a surface reference.
//   3. The same CUDA array is bound to a texture reference, and transformKernel
//      samples it at rotated coordinates, writing the result back to d_data.
//
// NOTE: texture<> / surface<> references are a legacy API (removed in CUDA 12);
// this sample intentionally demonstrates them and therefore needs an older toolkit.

// These macros only have an effect when they are defined BEFORE <windows.h>.
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

#define MIN_EPSILON_ERROR 5e-3f

float angle = 0.5f;                              // rotation angle, in radians
texture<float, 2, cudaReadModeElementType> tex;  // texture reference used for sampling
surface<void, 2> outputSurface;                  // surface reference used for writing

// Copies the image in global memory (gIData, row-major, width x height floats)
// into the CUDA array bound to outputSurface.
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads that fall
// outside the image return without touching memory.
__global__ void surfaceWriteKernel(float *gIData, int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so guard the image border.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // The surface x coordinate is passed as a byte offset, hence x * sizeof(float).
    surf2Dwrite(gIData[y * width + x], outputSurface, (int)(x * sizeof(float)), (int)y, cudaBoundaryModeTrap);
}

// Samples the texture bound to `tex` at coordinates rotated by `theta` radians
// around the image centre and stores the result in gOData (row-major,
// width x height floats).
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// that fall outside the image return without touching memory.
__global__ void transformKernel(float *gOData, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // Normalised coordinates with the origin moved to the image centre.
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // Evaluate the trigonometric terms once per thread.
    float c = cosf(theta);
    float s = sinf(theta);

    gOData[y * width + x] = tex2D(tex, u * c - v * s + 0.5f, v * c + u * s + 0.5f);
}

int main()
{
    printf("\n\tStart.\n");
    checkCudaErrors(cudaSetDevice(0));  // device selection logic was removed; always use device 0
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, 0));
    printf("\n\tDevice %s, Multi-Processors: %d, SM %d.%d\n", deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major, deviceProps.minor);

    // Load the input image and the reference (expected) image.
    // Paths are hard-coded; the sdkFindFilePath() lookup was removed.
    float *h_data = NULL, *h_dataRef = NULL;
    unsigned int width = 0, height = 0;
    unsigned int refWidth = 0, refHeight = 0;
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\lena_bw.pgm", &h_data, &width, &height))
    {
        fprintf(stderr, "\n\tFailed to load the input image.\n");
        return EXIT_FAILURE;
    }
    if (!sdkLoadPGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\ref_rotated.pgm", &h_dataRef, &refWidth, &refHeight))
    {
        fprintf(stderr, "\n\tFailed to load the reference image.\n");
        free(h_data);
        return EXIT_FAILURE;
    }
    // The result is compared element by element, so both images must have the same size.
    if (refWidth != width || refHeight != height)
    {
        fprintf(stderr, "\n\tInput and reference images differ in size.\n");
        free(h_data);
        free(h_dataRef);
        return EXIT_FAILURE;
    }
    const size_t size = (size_t)width * height * sizeof(float);
    printf("\n\tLoad input files, %u x %u pixels\n", width, height);

    // Allocate device memory: a linear buffer plus a CUDA array usable as a surface.
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));
    cudaArray *cuArray = NULL;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));
    checkCudaErrors(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
    // When only texture memory is used, the host data can be copied straight into cuArray:
    // cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);

    // Bind the surface reference to the CUDA array.
    checkCudaErrors(cudaBindSurfaceToArray(outputSurface, cuArray, channelDesc));

    // Fill the CUDA array through the surface. The grid is rounded up so that
    // images whose size is not a multiple of the block size are fully covered.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y, 1);
    surfaceWriteKernel<<<dimGrid, dimBlock>>>(d_data, width, height);
    getLastCudaError("surfaceWriteKernel launch failed");

    // Bind the texture reference to the same CUDA array.
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
    checkCudaErrors(cudaBindTextureToArray(tex, cuArray, channelDesc));

    // Warm-up run.
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel (warm-up) launch failed");
    checkCudaErrors(cudaDeviceSynchronize());

    // Timed run.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, angle);
    getLastCudaError("transformKernel launch failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    // Read the elapsed time BEFORE deleting the timer; reading it afterwards
    // is what produced the "0.000000 ms, inf Mpixels/sec" output.
    const float elapsedMs = sdkGetTimerValue(&timer);
    sdkDeleteTimer(&timer);
    printf("\n\tCost time: %f ms, %.2f Mpixels/sec\n", elapsedMs, (width * height / (elapsedMs / 1000.0f)) / 1e6);

    // Copy the result back, save it and compare it with the reference image.
    checkCudaErrors(cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost));
    sdkSavePGM("D:\\Code\\CUDA\\cudaProjectTemp\\data\\output.pgm", h_data, width, height);
    printf("\n\tSave output file.\n");
    printf("\n\tFinish, return %s.\n", compareData(h_data, h_dataRef, width * height, MIN_EPSILON_ERROR, 0.0f) ? "Passed" : "Failed");

    // Release device and host resources.
    checkCudaErrors(cudaUnbindTexture(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cuArray));
    free(h_data);
    free(h_dataRef);
    getchar();
    return 0;
}
▶ 输出结果
1 Start. 2 3 Device GeForce GTX 1070, Multi-Processors: 16, SM 6.1 4 5 Load input files, 512 x 512 pixels 6 7 Cost time: 0.000000 ms, inf Mpixels/sec 8 9 Save output file. 10 11 Finish, return Passed
▶ 涨姿势
● 使用函数 sdkLoadPGM() 读取图片数据
1 // helper_image.h 2 inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, unsigned int *h, unsigned int *channels) 3 { 4 FILE *fp = NULL; 5 if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) 6 { 7 std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; 8 return false; 9 } 10 11 // check header 12 char header[PGMHeaderSize]; 13 if (fgets(header, PGMHeaderSize, fp) == NULL) 14 { 15 std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; 16 return false; 17 } 18 if (strncmp(header, "P5", 2) == 0) 19 { 20 *channels = 1; 21 } 22 else if (strncmp(header, "P6", 2) == 0) 23 { 24 *channels = 3; 25 } 26 else 27 { 28 std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; 29 *channels = 0; 30 return false; 31 } 32 33 // parse header, read maxval, width and height 34 unsigned int width = 0; 35 unsigned int height = 0; 36 unsigned int maxval = 0; 37 unsigned int i = 0; 38 while (i < 3) 39 { 40 if (fgets(header, PGMHeaderSize, fp) == NULL) 41 { 42 std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; 43 return false; 44 } 45 46 if (header[0] == '#') 47 { 48 continue; 49 } 50 51 if (i == 0) 52 { 53 i += SSCANF(header, "%u %u %u", &width, &height, &maxval); 54 } 55 else if (i == 1) 56 { 57 i += SSCANF(header, "%u %u", &height, &maxval); 58 } 59 else if (i == 2) 60 { 61 i += SSCANF(header, "%u", &maxval); 62 } 63 } 64 65 // check if given handle for the data is initialized 66 if (NULL != *data) 67 { 68 if (*w != width || *h != height) 69 { 70 std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; 71 } 72 } 73 else 74 { 75 *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height **channels); 76 *w = width; 77 *h = height; 78 } 79 80 // read and close file 81 if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0) 82 { 83 std::cerr << "__LoadPPM() read data returned error." 
<< std::endl; 84 } 85 86 fclose(fp); 87 return true; 88 } 89 90 template <class T> inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h) 91 { 92 unsigned char *idata = NULL; 93 unsigned int channels; 94 95 if (!__loadPPM(file, &idata, w, h, &channels)) 96 return false; 97 unsigned int size = *w **h * channels; 98 99 if (*data == NULL)// 如果 T **data 没有初始化,则按照读取的 size 进行初始化 100 *data = (T *)malloc(sizeof(T) * size); 101 102 std::transform(idata, idata + size, *data, ConverterFromUByte<T>());// 拷贝数据到 data 中 103 104 free(idata); 105 return true; 106 }
● 使用到的表面写入函数原型
// surface_functions.h
// Device-side write of one element of type T into a 2D surface reference.
// The value's address and sizeof(T) are forwarded, together with the surface,
// the coordinates and the boundary mode, to the "__surf2Dwrite_v2" handler.
// The body is only compiled for device code (__CUDA_ARCH__ defined).
// The sample in this article passes x * 4 for float data, i.e. x as a byte offset.
template<class T>
static __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
{
#ifdef __CUDA_ARCH__
    // Pointer type the handler expects for values of type T.
    typedef typename __nv_surf_trait<T>::cast_type HandlerPtr;

    __nv_tex_surf_handler("__surf2Dwrite_v2",
                          (HandlerPtr)&val,
                          (int)sizeof(T),
                          surf,
                          x,
                          y,
                          mode);
#endif
}