0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc - 爨爨爨好

公告

0_Simple__simpleTemplates + 0_Simple__simpleTemplates_nvrtc

使用 C++ 的模板

▶ 源代码：静态使用

  // sharedmem.cuh
  //
  // SharedMemory<T> hands out a T-typed pointer to the dynamically sized
  // shared-memory array of the calling kernel.  A templated kernel cannot write
  // `extern __shared__ T sdata[];` directly for several T, so each supported
  // type gets its own specialisation that declares an array of that type under
  // its own name.  The array size is whatever the launch passes as the
  // dynamic shared-memory byte count.
  #ifndef SHAREDMEM_CUH_   // was _SHAREDMEM_H_: underscore + capital is a reserved identifier
  #define SHAREDMEM_CUH_

  // Primary template, used for every type without a specialisation below.
  // It declares and calls a device function `error()` that this header never
  // defines -- presumably so that using an unsupported T fails at build time
  // instead of silently returning a null pointer (NOTE(review): confirm).
  template <typename T> struct SharedMemory
  {
      __device__ T *getPointer()
      {
          extern __device__ void error(void);
          error();
          return NULL;
      }
  };

  // Expands to the specialisation SharedMemory<TYPE>, whose getPointer()
  // returns the `extern __shared__ TYPE NAME[]` array.
  #define SHAREDMEM_SPECIALIZE(TYPE, NAME)        \
      template <> struct SharedMemory <TYPE>      \
      {                                           \
          __device__ TYPE *getPointer()           \
          {                                       \
              extern __shared__ TYPE NAME[];      \
              return NAME;                        \
          }                                       \
      };

  // Supported element types (same set and same array names as before).
  SHAREDMEM_SPECIALIZE(int,            s_int)
  SHAREDMEM_SPECIALIZE(unsigned int,   s_uint)
  SHAREDMEM_SPECIALIZE(char,           s_char)
  SHAREDMEM_SPECIALIZE(unsigned char,  s_uchar)
  SHAREDMEM_SPECIALIZE(short,          s_short)
  SHAREDMEM_SPECIALIZE(unsigned short, s_ushort)
  SHAREDMEM_SPECIALIZE(long,           s_long)
  SHAREDMEM_SPECIALIZE(unsigned long,  s_ulong)
  SHAREDMEM_SPECIALIZE(bool,           s_bool)
  SHAREDMEM_SPECIALIZE(float,          s_float)
  SHAREDMEM_SPECIALIZE(double,         s_double)

  #undef SHAREDMEM_SPECIALIZE

  #endif // SHAREDMEM_CUH_

  1 // simpleTemplates.cu
  2 #include <stdio.h>
  3 #include <timer.h>
  4 #include <cuda_runtime.h>
  5 #include "device_launch_parameters.h"
  6 #include <helper_functions.h>
  7 #include <helper_cuda.h>
  8 #include "sharedmem.cuh"
  9 
 // Scales each input element by the block size.
 //   g_idata : device input, read at index threadIdx.x
 //   g_odata : device output, written at index threadIdx.x
 // The kernel indexes by threadIdx.x only, so it touches blockDim.x elements;
 // the dynamic shared-memory size of the launch must cover that many T values.
 template<class T> __global__ void testKernel(T *g_idata, T *g_odata)
 {
     // Typed view of the dynamic shared memory; stands in for
     // `extern __shared__ T sdata[];`, obtained through SharedMemory<T>.
     T *const staged = SharedMemory<T>().getPointer();
     const unsigned int idx = threadIdx.x;
     const T scale = (T) blockDim.x;

     staged[idx] = g_idata[idx];          // global -> shared
     __syncthreads();
     staged[idx] = scale * staged[idx];   // multiply by the thread count of the block
     __syncthreads();
     g_odata[idx] = staged[idx];          // shared -> global
 }
 23 
 // Host reference result: reference[i] = idata[i] * len for i in [0, len).
 // `reference` and `idata` may point at the same array (the caller in this
 // file does exactly that); each element is read before it is overwritten.
 template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
 {
     // Convert the element count to T once, outside the loop.
     const T factor = static_cast<T>(len);
     T *out = reference;
     for (const T *in = idata, *const stop = idata + len; in != stop; ++in, ++out)
         *out = *in * factor;
 }
 30 
 // ArrayComparator wrapper.
 // Generic fallback for element types without a specialisation: it performs
 // no comparison, reports that on stderr and always returns false.
 template<class T> class ArrayComparator
 {
     public:
         bool compare(const T *reference, T *data, unsigned int len)
         {
             fputs("Error: no comparison function implemented for this type\n", stderr);
             return false;
         }
 };
 // Specialisations for int and float; both delegate to compareData(), which
 // is defined in helper_image.h.
 template<> class ArrayComparator<int>
 {
     public:
         // Returns whatever compareData() reports for the two int arrays.
         bool compare(const int *reference, int *data, unsigned int len)
         {
             const bool same = compareData(reference, data, len, 0.15f, 0.0f);
             return same;
         }
 };
 47 
 // float specialisation: delegates to compareData() with 0.15f for both of
 // its trailing arguments.
 template<> class ArrayComparator<float>
 {
     public:
         // Returns whatever compareData() reports for the two float arrays.
         bool compare(const float *reference, float *data, unsigned int len)
         {
             const bool same = compareData(reference, data, len, 0.15f, 0.15f);
             return same;
         }
 };
 53 
 // ArrayFileWriter wrapper.
 // Generic fallback for element types without a specialisation: nothing is
 // written, the problem is reported on stderr and false is returned.
 template<class T> class ArrayFileWriter
 {
     public:
         bool write(const char *filename, T *data, unsigned int len, float epsilon)
         {
             fputs("Error: no file write function implemented for this type\n", stderr);
             return false;
         }
 };
 // Specialisations for int and float; both delegate to sdkWriteFile(), which
 // is defined in helper_image.h.
 template<> class ArrayFileWriter<int>
 {
     public:
         // Forwards all arguments to sdkWriteFile(); the last argument is always false.
         bool write(const char *filename, int *data, unsigned int len, float epsilon)
         {
             const bool written = sdkWriteFile(filename, data, len, epsilon, false);
             return written;
         }
 };
 70 
 // float specialisation: delegates to sdkWriteFile().
 template<> class ArrayFileWriter<float>
 {
     public:
         // Forwards all arguments to sdkWriteFile(); the last argument is always false.
         bool write(const char *filename, float *data, unsigned int len, float epsilon)
         {
             const bool written = sdkWriteFile(filename, data, len, epsilon, false);
             return written;
         }
 };
 76 
 // Reports a failed CUDA runtime call on stderr.
 // Returns true when `status` is cudaSuccess, false otherwise.
 static bool cudaCallOk(cudaError_t status, const char *what)
 {
     if (status == cudaSuccess)
         return true;
     fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
     return false;
 }

 // Runs testKernel<T> on `len` elements (one block of `len` threads, with
 // len * sizeof(T) bytes of dynamic shared memory) and checks the result
 // against computeGold<T>.
 //   len     : number of elements == threads in the single block; must be > 0.
 //             A value the device cannot launch is reported as a launch error.
 //   returns : true only if every CUDA call succeeded and the comparison passed.
 template<class T> bool test(int len)
 {
     if (len <= 0)
     {
         fprintf(stderr, "Error: test length must be positive (got %d)\n", len);
         return false;
     }
     const unsigned int count = static_cast<unsigned int>(len);
     const size_t mem_size = sizeof(T) * count;
     const dim3 grid(1, 1, 1);
     const dim3 threads(count, 1, 1);
     ArrayComparator<T> comparator;

     // All pointers start out null so the cleanup at the end is always safe.
     T *h_idata = NULL, *h_odata = NULL, *d_idata = NULL, *d_odata = NULL;
     bool ok = cudaCallOk(cudaSetDevice(0), "cudaSetDevice");
     StartTimer();

     // Allocate host and device buffers.
     h_idata = (T *)malloc(mem_size);
     h_odata = (T *)malloc(mem_size);
     if (h_idata == NULL || h_odata == NULL)
     {
         fprintf(stderr, "Error: host allocation of %lu bytes failed\n", (unsigned long)mem_size);
         ok = false;
     }
     ok = ok && cudaCallOk(cudaMalloc((void **)&d_idata, mem_size), "cudaMalloc(d_idata)");
     ok = ok && cudaCallOk(cudaMalloc((void **)&d_odata, mem_size), "cudaMalloc(d_odata)");

     // Fill the input with 0, 1, 2, ... and upload it.
     if (ok)
     {
         for (unsigned int i = 0; i < count; ++i)
             h_idata[i] = (T) i;
         ok = cudaCallOk(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice),
                         "cudaMemcpy(HostToDevice)");
     }

     // Compute and time.
     if (ok)
     {
         testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
         // A launch does not return a status itself; bad configurations show up here.
         ok = cudaCallOk(cudaGetLastError(), "testKernel launch");
     }
     // The blocking copy also surfaces errors raised while the kernel ran.
     ok = ok && cudaCallOk(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(DeviceToHost)");

     bool result = false;
     if (ok)
     {
         printf("\n\tProcessing time: %f ms\n", GetTimer());

         // Check the result: h_idata is overwritten in place with the expected values.
         computeGold<T>(h_idata, h_idata, count);
         result = comparator.compare(h_idata, h_odata, count);
         // Optional file dump:
         //ArrayFileWriter<T>().write("./data/regression.dat", h_odata, count, 0.0f);
     }

     free(h_idata);
     free(h_odata);
     cudaFree(d_idata);
     cudaFree(d_odata);
     return result;
 }
113 
// Entry point: runs the float test with 32 and with 64 elements, prints the
// verdict of each, then waits for a key press before exiting.
int main()
{
    printf("\n\tStart.\n");

    const bool passed32 = test<float>(32);
    printf("\n\t> test<float, 32>, result: %s.\n", passed32 ? "Passed" : "Failed");

    const bool passed64 = test<float>(64);
    printf("\n\t> test<float, 64>, result: %s.\n", passed64 ? "Passed" : "Failed");

    getchar();  // keep the console window open
    return 0;
}

▶ 输出结果：

    Start.

    Processing time: 107.394216 ms

    > test<float, 32>, result: Passed.

    Processing time: 3.153182 ms

    > test<float, 64>, result: Passed.

▶ 源代码：使用运行时编译

1 // sharedmem.cuh，与静态完全相同

 1 // simpleTemplates_kernel.cu
 2 #include "sharedmem.cuh"
 3 
 // Scales each input element by the block size, staging the data in dynamic
 // shared memory.
 //   g_idata : device input, read at index threadIdx.x
 //   g_odata : device output, written at index threadIdx.x
 // This is a __device__ function, not a kernel: the extern "C" entry points
 // in this file call it like an ordinary function, which is not allowed for a
 // __global__ function (a __global__ call needs an execution configuration).
 // It contains __syncthreads(), so every thread of the block must call it.
 template<class T> __device__ void testKernel(T *g_idata, T *g_odata)
 {
     SharedMemory<T> smem;
     T *sdata = smem.getPointer();
     // The two lines above stand in for `extern __shared__ T sdata[];`
     const unsigned int tid = threadIdx.x;

     sdata[tid] = g_idata[tid];                 // global -> shared
     __syncthreads();
     sdata[tid] = (T)blockDim.x * sdata[tid];   // multiply by the thread count of the block
     __syncthreads();
     g_odata[tid] = sdata[tid];                 // shared -> global
 }
17 
// float entry point with C linkage (unmangled name "testFloat");
// forwards both pointers to testKernel<float>.
extern "C" __global__ void testFloat(float *p1, float *p2)
{
    testKernel<float>(p1, p2);
}
19 
// int entry point with C linkage (unmangled name "testInt");
// forwards both pointers to testKernel<int>.
extern "C" __global__ void testInt(int *p1, int *p2)
{
    testKernel<int>(p1, p2);
}

  1 // simpleTemplates.cpp
  2 #include <stdio.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_functions.h>
  6 #include <nvrtc_helper.h>
  7 #include <timer.h>
  8 
  // Host reference result: reference[i] = idata[i] * len for i in [0, len).
  // `reference` and `idata` may be the same array (the caller in this file
  // passes one buffer for both); each element is read before it is written.
  template<class T> void computeGold(T *reference, T *idata, const unsigned int len)
  {
      // Convert the element count to T once, outside the loop.
      const T multiplier = static_cast<T>(len);
      unsigned int pos = 0;
      while (pos < len)
      {
          reference[pos] = idata[pos] * multiplier;
          ++pos;
      }
  }
 15 
 // ArrayComparator wrapper.
 // Generic fallback for element types without a specialisation: it performs
 // no comparison, reports that on stderr and always returns false.
 template<class T> class ArrayComparator
 {
 public:
     bool compare(const T *reference, T *data, unsigned int len)
     {
         fputs("Error: no comparison function implemented for this type\n", stderr);
         return false;
     }
 };
 26 // int 和 flaot 的实现，其中的函数 compareData() 定义于 helper_image.h
 27 template<> class ArrayComparator<int>
 28 {
 29 public:
 30     bool compare(const int *reference, int *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.0f); }
 31 };
 32 
 33 template<> class ArrayComparator<float>
 34 {
 35 public:
 36     bool compare(const float *reference, float *data, unsigned int len) { return compareData(reference, data, len, 0.15f, 0.15f); }
 37 };
 38 
 // ArrayFileWriter wrapper.
 // Generic fallback for element types without a specialisation: nothing is
 // written, the problem is reported on stderr and false is returned.
 template<class T> class ArrayFileWriter
 {
 public:
     bool write(const char *filename, T *data, unsigned int len, float epsilon)
     {
         fputs("Error: no file write function implemented for this type\n", stderr);
         return false;
     }
 };
 49 // int 和 flaot 的实现，其中的函数 sdkWriteFile() 定义于 helper_image.h
 50 template<> class ArrayFileWriter<int>
 51 {
 52 public:
 53     bool write(const char *filename, int *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
 54 };
 55 
 56 template<> class ArrayFileWriter<float>
 57 {
 58 public:
 59     bool write(const char *filename, float *data, unsigned int len, float epsilon) { return sdkWriteFile(filename, data, len, epsilon, false); }
 60 };
 61 
 62 // getKernel 的模板
 63 template <typename T> CUfunction getKernel(CUmodule in);
 64 
 65 template<> CUfunction getKernel<int>(CUmodule in)
 66 {
 67     CUfunction kernel_addr;
 68     cuModuleGetFunction(&kernel_addr, in, "testInt");
 69     return kernel_addr;
 70 }
 71 
 72 template<> CUfunction getKernel<float>(CUmodule in)
 73 {
 74     CUfunction kernel_addr;
 75     cuModuleGetFunction(&kernel_addr, in, "testFloat");
 76     return kernel_addr;
 77 }
 78                                      
 79 template<class T> bool test(int len)
 80 {
 81     // 与静态不同，编译 PTX
 82     char *kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\simpleTemplates_nvrtc\\simpleTemplates_kernel.cu";
 83     char *ptx;
 84     size_t ptxSize;
 85     compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);  // 1, NULL 分别为 argc 和 argv
 86     CUmodule module = loadPTX(ptx, 1, NULL);                    // 1, NULL 分别为 argc 和 argv，有关于 GPU的输出
 87 
 88     unsigned int mem_size = sizeof(T) * len;
 89     dim3  grid(1, 1, 1);
 90     dim3  threads(len, 1, 1);
 91     ArrayComparator<T> comparator;
 92     ArrayFileWriter<T> writer;
 93     StartTimer();
 94 
 95     // 申请内存
 96     T *h_idata, *h_odata;
 97     CUdeviceptr d_idata, d_odata;                   // 与静态不同
 98     h_idata = (T *)malloc(mem_size);
 99     h_odata = (T *)malloc(mem_size);
100     cuMemAlloc(&d_idata, mem_size);                 // 与静态不同
101     cuMemAlloc(&d_odata, mem_size);
102     for (unsigned int i = 0; i < len; ++i)
103         h_idata[i] = (T)i;
104     cuMemcpyHtoD(d_idata, h_idata, mem_size);       // 与静态不同
105 
106     // 计算和计时
107     CUfunction kernel_addr = getKernel<T>(module);
108 
109     void *arr[] = { (void *)&d_idata, (void *)&d_odata };
110     cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, threads.x, threads.y, threads.z, mem_size, 0, &arr[0], 0);
111     cuCtxSynchronize();                             // 上下文同步
112     cuMemcpyDtoH(h_odata, d_odata, sizeof(T) * len);// 与静态不同
113     printf("\n\tProcessing time: %f ms\n", GetTimer());
114 
115     // 检查结果
116     computeGold<T>(h_idata, h_idata, len);// 生成理论结果数据
117     bool result = comparator.compare(h_idata, h_odata, len);
118     //writer.write("./data/regression.dat", h_odata, len, 0.0f);// 写入文件的部分
119 
120     free(h_idata);
121     free(h_odata);
122     cuMemFree(d_idata);                             // 与静态不同
123     cuMemFree(d_odata);
124     return result;
125 }
126 
127 int main()
128 {
129     printf("\n\tStart.\n");
130     printf("\n\t> test<float, 32>, result: %s.\n", test<float>(32) ? "Passed" : "Failed");
131     printf("\n\t> test<int, 64>, result: %s.\n", test<int>(64) ? "Passed" : "Failed");
132 
133     getchar();
134     return 0;
135 }

▶ 输出结果：

    Start.
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability

    Processing time: 0.699976 ms

    > test<float, 32>, result: Passed.
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability

    Processing time: 0.665355 ms

    > test<int, 64>, result: Passed.

▶ 涨姿势

● 封装了 SharedMemory、ArrayComparator、ArrayFileWriter 三个模板，并针对不同的数据类型分别给出了特化实现。

posted on 2017-12-02 22:59 爨爨爨好阅读(238) 评论(0) 编辑收藏举报

努力加载评论中...

刷新页面返回顶部