使用 C++ 的模板
▶ 源代码:静态使用
// sharedmem.cuh
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_

// SharedMemory<T>: wrapper whose getPointer() hands out a T* to an
// `extern __shared__` array. Every supported element type has its own explicit
// specialization below, each declaring an unsized array of that type.

// Primary template: selected for any T that has no specialization.
// It calls error(), which is only declared here (no definition appears in this
// header), and then returns NULL.
// NOTE(review): presumably intended to make unsupported types fail at build
// time -- confirm.
template <typename T>
struct SharedMemory
{
    __device__ T *getPointer()
    {
        extern __device__ void error(void);
        error();
        return NULL;
    }
};

// Explicit specializations, one per supported element type.

template <>
struct SharedMemory<int>
{
    __device__ int *getPointer()
    {
        extern __shared__ int s_int[];
        return s_int;
    }
};

template <>
struct SharedMemory<unsigned int>
{
    __device__ unsigned int *getPointer()
    {
        extern __shared__ unsigned int s_uint[];
        return s_uint;
    }
};

template <>
struct SharedMemory<char>
{
    __device__ char *getPointer()
    {
        extern __shared__ char s_char[];
        return s_char;
    }
};

template <>
struct SharedMemory<unsigned char>
{
    __device__ unsigned char *getPointer()
    {
        extern __shared__ unsigned char s_uchar[];
        return s_uchar;
    }
};

template <>
struct SharedMemory<short>
{
    __device__ short *getPointer()
    {
        extern __shared__ short s_short[];
        return s_short;
    }
};

template <>
struct SharedMemory<unsigned short>
{
    __device__ unsigned short *getPointer()
    {
        extern __shared__ unsigned short s_ushort[];
        return s_ushort;
    }
};

template <>
struct SharedMemory<long>
{
    __device__ long *getPointer()
    {
        extern __shared__ long s_long[];
        return s_long;
    }
};

template <>
struct SharedMemory<unsigned long>
{
    __device__ unsigned long *getPointer()
    {
        extern __shared__ unsigned long s_ulong[];
        return s_ulong;
    }
};

template <>
struct SharedMemory<bool>
{
    __device__ bool *getPointer()
    {
        extern __shared__ bool s_bool[];
        return s_bool;
    }
};

template <>
struct SharedMemory<float>
{
    __device__ float *getPointer()
    {
        extern __shared__ float s_float[];
        return s_float;
    }
};

template <>
struct SharedMemory<double>
{
    __device__ double *getPointer()
    {
        extern __shared__ double s_double[];
        return s_double;
    }
};

#endif
// simpleTemplates.cu
#include <stdio.h>
#include <timer.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>
#include "sharedmem.cuh"

// Multiplies every input element by blockDim.x, staging the data through
// shared memory.
// Launch layout: only threadIdx.x is used for indexing, so the kernel expects
// a single 1-D block with one thread per element.
// Shared memory: the launch must supply blockDim.x * sizeof(T) bytes of
// dynamic shared memory.
template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // The two lines above together are equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;

    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T) blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

// Host reference: reference[i] = idata[i] * len. Reads idata[i] before writing
// reference[i], so the two pointers may refer to the same array.
template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
{
    const T T_len = static_cast<T>(len); // convert unsigned int -> T once, outside the loop
    for (unsigned int i = 0; i < len; ++i)
        reference[i] = idata[i] * T_len;
}

// ArrayComparator: generic fallback that reports an error for unsupported types.
template<class T> class ArrayComparator
{
public:
    bool compare(const T *reference, T *data, unsigned int len)
    {
        fprintf(stderr, "Error: no comparison function implemented for this type\n");
        return false;
    }
};

// int and float implementations; compareData() is defined in helper_image.h.
template<> class ArrayComparator<int>
{
public:
    bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
};

template<> class ArrayComparator<float>
{
public:
    bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
};

// ArrayFileWriter: generic fallback that reports an error for unsupported types.
template<class T> class ArrayFileWriter
{
public:
    bool write(const char *filename, T *data, unsigned int len, float epsilon)
    {
        fprintf(stderr, "Error: no file write function implemented for this type\n");
        return false;
    }
};

// int and float implementations; sdkWriteFile() is defined in helper_image.h.
template<> class ArrayFileWriter<int>
{
public:
    bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

template<> class ArrayFileWriter<float>
{
public:
    bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
};

// Runs testKernel<T> on `len` elements (one block of `len` threads) and checks
// the result against the host reference.
// Returns true when the device output matches, false on mismatch or when the
// arguments / host allocations are invalid. CUDA API failures are reported
// through checkCudaErrors / getLastCudaError from helper_cuda.h.
template<class T> bool test(int len)
{
    // A block with zero (or a negative number of) threads is not a valid launch.
    if (len <= 0)
    {
        fprintf(stderr, "Error: len must be positive\n");
        return false;
    }

    const unsigned int count = static_cast<unsigned int>(len);
    const size_t mem_size = sizeof(T) * count;
    dim3 grid(1, 1, 1);
    dim3 threads(count, 1, 1);
    ArrayComparator<T> comparator;
    checkCudaErrors(cudaSetDevice(0));
    StartTimer();

    // Allocate host and device buffers.
    T *h_idata = (T *)malloc(mem_size);
    T *h_odata = (T *)malloc(mem_size);
    if (h_idata == NULL || h_odata == NULL)
    {
        fprintf(stderr, "Error: host memory allocation failed\n");
        free(h_idata);
        free(h_odata);
        return false;
    }
    T *d_idata = NULL;
    T *d_odata = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
    for (unsigned int i = 0; i < count; ++i)
        h_idata[i] = (T) i;
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // Compute and time. The third launch argument sizes the dynamic shared array.
    testKernel<T> << < grid, threads, mem_size >> > (d_idata, d_odata);
    getLastCudaError("testKernel launch failed"); // a launch itself returns no status
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
    printf("\n\tProcessing time: %f ms\n", GetTimer());

    // Check the result: overwrite h_idata in place with the expected values.
    computeGold<T>(h_idata, h_idata, count);
    bool result = comparator.compare(h_idata, h_odata, count);
    // Optional: write the device result to a file.
    //ArrayFileWriter<T> writer;
    //writer.write("./data/regression.dat", h_odata, count, 0.0f);

    free(h_idata);
    free(h_odata);
    checkCudaErrors(cudaFree(d_idata));
    checkCudaErrors(cudaFree(d_odata));
    return result;
}

int main()
{
    printf("\n\tStart.\n");
    printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
    printf("\n\t> test<float, 64>, result: %s.\n", test<float>(64) ? "Passed" : "Failed");

    getchar();
    return 0;
}
▶ 输出结果:
Start. Processing time: 107.394216 ms > test<float, 32>, result: Passed. Processing time: 3.153182 ms > test<float, 64>, result: Passed.
▶ 源代码:使用运行时编译
1 // sharedmem.cuh,与静态完全相同
// simpleTemplates_kernel.cu
#include "sharedmem.cuh"

// Device-side template body shared by the extern "C" entry points below.
// It is __device__, not __global__: the wrappers call it as an ordinary
// function, and a __global__ function can only be invoked with an execution
// configuration (<<<...>>>).
// Multiplies every input element by blockDim.x, staging the data through
// shared memory. Only threadIdx.x is used for indexing, so a single 1-D block
// with one thread per element is expected, and the launch must supply
// blockDim.x * sizeof(T) bytes of dynamic shared memory.
template<class T> __device__ void testKernel(T *g_idata, T *g_odata)
{
    SharedMemory<T> smem;
    T *sdata = smem.getPointer();
    // The two lines above together are equivalent to: extern __shared__ T sdata[];
    const unsigned int tid = threadIdx.x;

    sdata[tid] = g_idata[tid];
    __syncthreads();
    sdata[tid] = (T)blockDim.x * sdata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}

// Kernel entry points with C linkage (unmangled names "testFloat" / "testInt"),
// one per instantiated element type.
extern "C" __global__ void testFloat(float *p1, float *p2) { testKernel<float>(p1, p2); }

extern "C" __global__ void testInt(int *p1, int *p2) { testKernel<int>(p1, p2); }
1 // simpleTemplates.cpp 2 #include <stdio.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_functions.h> 6 #include <nvrtc_helper.h> 7 #include <timer.h> 8 9 template<class T> void computeGold(T *reference, T *idata, const unsigned int len)// 生成理论结果数据 10 { 11 const T T_len = static_cast<T>(len);// 强制类型转换(const unsigned int -> T),并加上 const 限定 12 for (unsigned int i = 0; i < len; ++i) 13 reference[i] = idata[i] * T_len; 14 } 15 16 // ArrayComparator 的封装 17 template<class T> class ArrayComparator 18 { 19 public: 20 bool compare(const T *reference, T *data, unsigned int len) 21 { 22 fprintf(stderr, "Error: no comparison function implemented for this type\n"); 23 return false; 24 } 25 }; 26 // int 和 flaot 的实现,其中的函数 compareData() 定义于 helper_image.h 27 template<> class ArrayComparator<int> 28 { 29 public: 30 bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); } 31 }; 32 33 template<> class ArrayComparator<float> 34 { 35 public: 36 bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); } 37 }; 38 39 // ArrayFileWriter 的封装 40 template<class T> class ArrayFileWriter 41 { 42 public: 43 bool write(const char *filename, T *data, unsigned int len, float epsilon) 44 { 45 fprintf(stderr, "Error: no file write function implemented for this type\n"); 46 return false; 47 } 48 }; 49 // int 和 flaot 的实现,其中的函数 sdkWriteFile() 定义于 helper_image.h 50 template<> class ArrayFileWriter<int> 51 { 52 public: 53 bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); } 54 }; 55 56 template<> class ArrayFileWriter<float> 57 { 58 public: 59 bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); } 60 }; 61 62 // getKernel 的模板 63 template <typename T> CUfunction 
getKernel(CUmodule in); 64 65 template<> CUfunction getKernel<int>(CUmodule in) 66 { 67 CUfunction kernel_addr; 68 cuModuleGetFunction(&kernel_addr, in, "testInt"); 69 return kernel_addr; 70 } 71 72 template<> CUfunction getKernel<float>(CUmodule in) 73 { 74 CUfunction kernel_addr; 75 cuModuleGetFunction(&kernel_addr, in, "testFloat"); 76 return kernel_addr; 77 } 78 79 template<class T> bool test(int len) 80 { 81 // 与静态不同,编译 PTX 82 char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu"; 83 char *ptx; 84 size_t ptxSize; 85 compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0); // 1, NULL 分别为 argc 和 argv 86 CUmodule module = loadPTX(ptx, 1, NULL); // 1, NULL 分别为 argc 和 argv,有关于 GPU的输出 87 88 unsigned int mem_size = sizeof(T) * len; 89 dim3 grid(1, 1, 1); 90 dim3 threads(len, 1, 1); 91 ArrayComparator<T> comparator; 92 ArrayFileWriter<T> writer; 93 StartTimer(); 94 95 // 申请内存 96 T *h_idata, *h_odata; 97 CUdeviceptr d_idata, d_odata; // 与静态不同 98 h_idata = (T *)malloc(mem_size); 99 h_odata = (T *)malloc(mem_size); 100 cuMemAlloc(&d_idata, mem_size); // 与静态不同 101 cuMemAlloc(&d_odata, mem_size); 102 for (unsigned int i = 0; i < len; ++i) 103 h_idata[i] = (T)i; 104 cuMemcpyHtoD(d_idata, h_idata, mem_size); // 与静态不同 105 106 // 计算和计时 107 CUfunction kernel_addr = getKernel<T>(module); 108 109 void *arr[] = { (void *)&d_idata, (void *)&d_odata }; 110 cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0); 111 cuCtxSynchronize(); // 上下文同步 112 cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);// 与静态不同 113 printf("\n\tProcessing time: %f ms\n", GetTimer()); 114 115 // 检查结果 116 computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据 117 bool result = comparator.compare(h_idata, h_odata, len); 118 //writer.write("./data/regression.dat", h_odata, len, 0.0f);// 写入文件的部分 119 120 free(h_idata); 121 free(h_odata); 122 cuMemFree(d_idata); // 与静态不同 123 cuMemFree(d_odata); 124 return 
result; 125 } 126 127 int main() 128 { 129 printf("\n\tStart.\n"); 130 printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed"); 131 printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed"); 132 133 getchar(); 134 return 0; 135 }
▶ 输出结果:
Start. > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Processing time: 0.699976 ms > test<float, 32>, result: Passed. > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Processing time: 0.665355 ms > test<int, 64>, result: Passed.
▶ 涨姿势
● 封装了 SharedMemory,ArrayComparator,ArrayFileWriter 三个模板,并定义了其在不同的数据类型下的实现。