
Computing matrix multiplication with the CUDA Driver API.

▶ Source code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string>

#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>
#include "matrixMul.h"

#define PTX_FILE "matrixMul_kernel64.ptx"
#define CUBIN_FILE "matrixMul_kernel64.cubin"

const bool use_64bit_memory_address = true;
using namespace std;

CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);  // search relative to argv[0] from the command line

    if (actual_path)
        module_path = actual_path;
    else
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty())
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != string::npos)
    {
        // read the whole PTX file into ptx_source
        FILE *fp = fopen(module_path.c_str(), "rb");
        fseek(fp, 0, SEEK_END);
        int file_size = ftell(fp);
        char *buf = new char[file_size + 1];
        fseek(fp, 0, SEEK_SET);
        fread(buf, sizeof(char), file_size, fp);
        fclose(fp);
        buf[file_size] = '\0';
        ptx_source = buf;
        delete[] buf;
    }
    return true;
}

static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul)
{
    CUfunction cuFunction = 0;  // holds the kernel function handle fetched from the module
    CUresult status;            // status of each call; on failure, goto jumps to the end of the function
    int major = 0, minor = 0;
    char deviceName[256];
    string module_path, ptx_source;

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);// pick the device named on the command line, otherwise the one with the highest compute capability
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    cuDeviceGetName(deviceName, 256, cuDevice);
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
    cuDeviceTotalMem(&totalGlobalMem, cuDevice);            // query the total amount of device memory
    printf("  Total amount of global memory:     %llu bytes\n", (unsigned long long)totalGlobalMem);
    printf("  64-bit Memory Address:             %s\n", (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

    status = cuCtxCreate(&cuContext, 0, cuDevice);          // create the context
    if (CUDA_SUCCESS != status)
        goto Error;

    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))// look for the module "matrixMul_kernel64.ptx"
    {
        if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source))// fall back to the module "matrixMul_kernel64.cubin"
        {
            printf("> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
    }
    else
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());

    if (module_path.rfind("ptx") != string::npos)
    {
        // in this branch we use compilation with parameters
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;

        // set up pointer to set the Maximum # of registers for a particular kernel
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        // JIT-compile the PTX into the module
        status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);
    }
    else
        status = cuModuleLoad(&cuModule, module_path.c_str());

    if (CUDA_SUCCESS != status)
        goto Error;

    // fetch the kernel function from the module
    if (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L)
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
    else
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");

    if (CUDA_SUCCESS != status)
        goto Error;
    *pMatrixMul = cuFunction;
    return CUDA_SUCCESS;

Error:
    cuCtxDestroy(cuContext);
    return status;
}

void runTest(int argc, char **argv)
{
    int block_size = 32;

    // get the kernel function
    CUfunction matrixMul = NULL;// CUDA function handle
    CUresult error_id = initCUDA(argc, argv, &matrixMul);// initialize the driver, load the module and fetch the function

    if (error_id != CUDA_SUCCESS)
    {
        printf("initCUDA() returned %d, exiting\n", (int)error_id);
        exit(EXIT_FAILURE);
    }

    // prepare the host data
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *) malloc(mem_size_B);
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);
    constantInit(h_A, size_A, 1.0f);    // all-ones matrix
    constantInit(h_B, size_B, 0.01f);   // all-0.01 matrix

    // on a 64-bit system, grab four 1 GB blocks of device memory here just to occupy address space; they are never used
    CUdeviceptr d_Mem[4];
    if (use_64bit_memory_address)
    {
        unsigned int mem_size = 1024*1024*1024;
        cuMemAlloc(&d_Mem[0], mem_size);
        cuMemAlloc(&d_Mem[1], mem_size);
        cuMemAlloc(&d_Mem[2], mem_size);
        cuMemAlloc(&d_Mem[3], mem_size);
    }

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    // timing
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    // two ways of launching the kernel through the Driver API
    if (1)
    {
        // with 64-bit memory addressing and enough device memory, pass the widths as size_t, otherwise as int; the launch call itself is identical
        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4*1024*1024*1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
            // CUDA 4.0 Driver API kernel launch, using the second-to-last pointer parameter (kernelParams)
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size*block_size * sizeof(float), NULL, args, NULL);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size*block_size * sizeof(float), NULL, args, NULL);
        }
    }
    else
    {
        int offset = 0;
        char argBuffer[256];// pack the parameters in the same order as args above into one buffer, using offset as the running position

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);

        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4*1024*1024*1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            *((size_t *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((size_t *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            *((int *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((int *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }

        // wrap the 5 parameters above in a void * array, together with the buffer size and a terminating macro
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API kernel launch, using the last pointer parameter (extra)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *) h_C, d_C, mem_size_C);

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // check the result
    printf("Checking computed result for correctness: ");
    bool correct = true;
    for (int i = 0; i < (int)(WC * HC); i++)
    {
        if (fabs(h_C[i] - (WA * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA*0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    if (use_64bit_memory_address)
    {
        cuMemFree(d_Mem[0]);
        cuMemFree(d_Mem[1]);
        cuMemFree(d_Mem[2]);
        cuMemFree(d_Mem[3]);
    }
    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
}

int main(int argc, char **argv)
{
    printf("[ matrixMulDrv(Driver API) ]\n");
    runTest(argc, argv);

    getchar();
    return 0;
}

 

▶ Output:

[ matrixMulDrv (Driver API) ]
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability
Total amount of global memory:     8589934592 bytes
64-bit Memory Address:             YES
sdkFindFilePath <matrixMul_kernel64.ptx> in ./
sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/
> findModulePath <./data/matrixMul_kernel64.ptx>
> initCUDA loading module: <./data/matrixMul_kernel64.ptx>
> PTX JIT log:

Processing time: 0.568077 (ms)
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

 

▶ Notes:

● Contents of the header matrixMul.h:

#ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_

// dimensions of the matrices involved in the computation
#define WA (4 * block_size)
#define HA (6 * block_size)
#define WB (4 * block_size)
#define HB WA
#define WC WB
#define HC HA

#endif // _MATRIXMUL_H_
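
With block_size = 32, as set in runTest, these macros expand to WA = WB = WC = 128 and HA = HC = 192, HB = 128, so A is 192×128, B is 128×128 and C is 192×128. Since A is filled with 1.0 and B with 0.01, every element of C should equal WA * 0.01 = 1.28, which is exactly the reference value the correctness check compares against.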

 

● Basic usage of the C++ string class

using namespace std;

int n = 16;
char *buf = new char[n];// dynamically allocate a character array of size n, similar to malloc
// ... fill buf, e.g. with fread ...
buf[n - 1] = '\0';      // terminate it manually
string buf2 = buf;      // assign the C string directly to a std::string
delete[] buf;           // release the array, similar to free

 

● class StopWatchInterface, a timing class defined in helper_timer.h. Only its usage is shown here; its internals are discussed in detail in the post on the helper headers.

StopWatchInterface *timer = NULL;   // pointer to the timer object
sdkCreateTimer(&timer);             // create the timer
sdkStartTimer(&timer);              // start timing

...                                 // run the kernel

sdkStopTimer(&timer);               // stop timing
sdkGetTimerValue(&timer);           // read the elapsed time (returns milliseconds as a float)
sdkDeleteTimer(&timer);             // destroy the timer

 

● Assorted definitions from cuda.h; a minimal error-checking sketch built on these calls is added after the list.

typedef int CUdevice;                      // CUDA device handle, an int identifying the device number
typedef struct CUfunc_st *CUfunction;      // CUDA function handle
typedef struct CUmod_st *CUmodule;         // CUDA module handle
typedef struct CUctx_st *CUcontext;        // CUDA context handle
typedef enum cudaError_enum {...}CUresult; // CUDA error codes
typedef unsigned long long CUdeviceptr;    // device pointer, an unsigned long long

CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);// get the device name

CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);   // get the device compute capability

inline CUdevice findCudaDeviceDRV(int argc, const char **argv);     // from helper_cuda_drvapi.h: pick the device named on the command line, otherwise the one with the highest compute capability; calls cuInit(0) internally

#define cuDeviceTotalMem cuDeviceTotalMem_v2                        // get the total amount of device memory
CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);

#define cuMemAlloc cuMemAlloc_v2                                    // allocate device memory
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);    

#define cuMemFree cuMemFree_v2                                      // free device memory
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);

CUresult CUDAAPI cuInit(unsigned int Flags);                        // mandatory driver initialization; must be called before creating a context, Flags may be 0

#define cuCtxCreate cuCtxCreate_v2                                  // create a context
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);

#define cuCtxDestroy cuCtxDestroy_v2                                // destroy a context
CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);                       

#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)   // Driver API counterpart of cudaMemcpy(..., cudaMemcpyHostToDevice)
#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)   // Driver API counterpart of cudaMemcpy(..., cudaMemcpyDeviceToHost)
#define __CUDA_API_PTDS(api) api

// JIT-compile the PTX image into module, with numOptions options whose names are in options and whose values are in optionValues
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);

// load a module from the file at path fname
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);

// fetch the function named name from module hmod and store it in the handle hfunc
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
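
Since every one of these calls returns a CUresult, it is convenient to wrap them in a small checking helper. The sketch below is not part of the sample; checkDrv is a hypothetical macro name, and it assumes cuGetErrorName from cuda.h (available in recent CUDA releases) for turning the error code into a readable string.

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// hypothetical checking macro: print the failing call and the CUresult name, then bail out
#define checkDrv(call)                                                    \
    do {                                                                  \
        CUresult _status = (call);                                        \
        if (_status != CUDA_SUCCESS) {                                    \
            const char *_name = NULL;                                     \
            cuGetErrorName(_status, &_name);                              \
            fprintf(stderr, "%s failed: %s (%d)\n", #call,                \
                    _name ? _name : "UNKNOWN", (int)_status);             \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// usage: checkDrv(cuCtxCreate(&cuContext, 0, cuDevice));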

 

● The code uses goto; the basic pattern is shown below. The upside is that initCUDA has a single error-handling exit where the context can be cleaned up; the downside is jumps scattered throughout the function.

int function()
{
    CUresult status;

    status = cudaFunction();
    if (status != CUDA_SUCCESS)     // cudaFunction did not run correctly
        goto Error;

    ...                             // normal execution continues here

    return 0;                       // normal exit, return 0
Error:
    return status;                  // abnormal exit, return the first error code
}

 

● A condensed view of how the Driver API is used. The source code above is long, but it boils down to the following, which makes the sequence of API calls easier to see.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>

int main()
{
    // handles and constants
    CUdevice cuDevice = 0;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction matrixMul = NULL;
    CUresult status;
    char module_path[] = "./data/matrixMul_kernel64.ptx";
    char ptx_source[63894];

    // initialize the driver and create the context
    cuInit(0);
    cuDeviceGet(&cuDevice, 0);
    status = cuCtxCreate(&cuContext, 0, cuDevice);

    // get the kernel function: read the PTX, JIT-compile it, look the function up
    FILE *fp = fopen(module_path, "rb");
    fseek(fp, 0, SEEK_END);
    int file_size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    fread(ptx_source, sizeof(char), file_size, fp);
    fclose(fp);
    ptx_source[file_size] = '\0';

    // set up the JIT compilation options
    const unsigned int jitNumOptions = 3;
    CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
    void **jitOptVals = new void *[jitNumOptions];

    // size of the compilation log buffer
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    int jitLogBufferSize = 1024;
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

    // pointer to the compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    char *jitLogBuffer = new char[jitLogBufferSize];
    jitOptVals[1] = jitLogBuffer;

    // maximum number of registers per kernel
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    int jitRegCount = 32;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // JIT-compile the module and fetch the kernel function
    status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals);
    printf("\nPTX JIT log:\n%s\n", jitLogBuffer);
    status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit");

    // prepare the data
    int block_size = 32;
    int wa = 4 * block_size;
    int ha = 6 * block_size;
    int wb = 4 * block_size;
    int hb = wa;
    int wc = wb;
    int hc = ha;

    unsigned int size_A = wa * ha;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = wb * hb;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = wc * hc;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);

    for (int i = 0; i < (int)size_A; ++i)
        h_A[i] = 1.0f;
    for (int i = 0; i < (int)size_B; ++i)
        h_B[i] = 0.01f;

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    dim3 block(block_size, block_size, 1);
    dim3 grid(wc / block_size, hc / block_size, 1);

    // two ways of launching the kernel through the Driver API
    if (1)
    {
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
        // CUDA 4.0 Driver API kernel launch, using the second-to-last pointer parameter (kernelParams)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
            2 * block_size*block_size * sizeof(float), NULL, args, NULL);
    }
    else
    {
        int offset = 0;
        char argBuffer[256];// pack the parameters in the same order as args above into one buffer, using offset as the running position

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        *((size_t *)&argBuffer[offset]) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *((size_t *)&argBuffer[offset]) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        // wrap the 5 parameters above in a void * array, together with the buffer size and a terminating macro
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API kernel launch, using the last pointer parameter (extra)
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
            2 * block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C);

    // check the result
    printf("Checking computed result for correctness: ");
    bool correct = true;
    for (int i = 0; i < (int)(wc * hc); i++)
    {
        if (fabs(h_C[i] - (wa * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa*0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);

    getchar();
    return 0;
}
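
● The host code never shows the kernel itself; it only fetches matrixMul_bs32_64bit from the PTX module. As a rough sketch only (not the kernel source that actually produced matrixMul_kernel64.ptx, which ships with the CUDA Samples), a kernel matching that exported name, the (C, A, B, wA, wB) argument list, the 32×32 block, and the 2 * block_size * block_size * sizeof(float) of dynamic shared memory passed by cuLaunchKernel could look like this:

#define BLOCK_SIZE 32

// extern "C" keeps the name unmangled so cuModuleGetFunction can find it
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
                                                size_t wA, size_t wB)
{
    // dynamic shared memory supplied by cuLaunchKernel:
    // first half caches a tile of A, second half a tile of B
    extern __shared__ float smem[];
    float *As = smem;
    float *Bs = smem + BLOCK_SIZE * BLOCK_SIZE;

    size_t bx = blockIdx.x, by = blockIdx.y;
    size_t tx = threadIdx.x, ty = threadIdx.y;

    // first and last tile of A processed by this block, and the step between tiles
    size_t aBegin = wA * BLOCK_SIZE * by;
    size_t aEnd   = aBegin + wA - 1;
    size_t aStep  = BLOCK_SIZE;

    // first tile of B for this block, stepping down one tile row of B at a time
    size_t bBegin = BLOCK_SIZE * bx;
    size_t bStep  = BLOCK_SIZE * wB;

    float Csub = 0.0f;
    for (size_t a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // each thread loads one element of the A tile and one element of the B tile
        As[ty * BLOCK_SIZE + tx] = A[a + wA * ty + tx];
        Bs[ty * BLOCK_SIZE + tx] = B[b + wB * ty + tx];
        __syncthreads();

        // accumulate the partial dot product for this pair of tiles
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty * BLOCK_SIZE + k] * Bs[k * BLOCK_SIZE + tx];
        __syncthreads();
    }

    // one thread writes one element of the 32x32 tile of C owned by this block
    size_t c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}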

 

posted on 2017-11-02 21:58 by 爨爨爨好