使用CUDA的 Driver API 来计算矩阵乘法。
▶ 源代码:
#include <stdio.h>
#include <math.h>
#include <string>

#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>
#include "matrixMul.h"

// Kernel module files produced from matrixMul_kernel.cu (64-bit build).
#define PTX_FILE "matrixMul_kernel64.ptx"
#define CUBIN_FILE "matrixMul_kernel64.cubin"

const bool use_64bit_memory_address = true;
using namespace std;

// Driver-API handles shared between initCUDA() and runTest().
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;

// Fill the first `size` floats of `data` with `val`.
void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

// Locate `module_file` relative to the executable path (argv[0]). On success
// store the full path in `module_path`; for a .ptx file also read the whole
// file text into `ptx_source`. Returns false when the file is missing or
// cannot be opened.
bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]); // search relative to argv[0]

    if (actual_path)
        module_path = actual_path;
    else
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty())
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != string::npos)
    {
        FILE *fp = fopen(module_path.c_str(), "rb");

        if (fp == NULL) // FIX: the original called fseek on fp without a NULL check
        {
            printf("> findModulePath could not open: <%s>\n", module_path.c_str());
            return false;
        }

        fseek(fp, 0, SEEK_END);
        long file_size = ftell(fp); // FIX: ftell returns long, was truncated to int
        char *buf = new char[file_size + 1];
        fseek(fp, 0, SEEK_SET);
        fread(buf, sizeof(char), file_size, fp);
        fclose(fp);
        buf[file_size] = '\0';
        ptx_source = buf;
        delete[] buf;
    }
    return true;
}

// Create the CUDA context, JIT-compile (or load) the matrixMul module and
// fetch the kernel into *pMatrixMul. Returns CUDA_SUCCESS, or destroys the
// context and returns the first failing status.
static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul)
{
    CUfunction cuFunction = 0; // kernel handle fetched from the module
    CUresult status;           // first failing status jumps to Error
    int major = 0, minor = 0;
    char deviceName[100];
    string module_path, ptx_source;

    // Pick the device named on the command line, or the most capable one.
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    cuDeviceGetName(deviceName, (int)sizeof(deviceName), cuDevice); // FIX: was 256 for a 100-byte buffer
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    cuDeviceTotalMem(&totalGlobalMem, cuDevice); // total device memory
    printf(" Total amount of global memory: %llu bytes\n", (unsigned long long)totalGlobalMem);
    printf(" 64-bit Memory Address: %s\n",
           (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

    status = cuCtxCreate(&cuContext, 0, cuDevice); // create the context
    if (CUDA_SUCCESS != status)
        goto Error;

    // Prefer the PTX module (JIT-compiled), fall back to the prebuilt cubin.
    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))
    {
        if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source))
        {
            printf("> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
    }
    else
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != string::npos) // FIX: match findModulePath (".ptx", not "ptx")
    {
        // JIT-compile the PTX with explicit options.
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];

        // size of the compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitLogBuffer[0] = '\0'; // FIX: printable even when the JIT writes no log
        jitOptVals[1] = jitLogBuffer;

        // cap on registers per thread for this kernel
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        // compile the module from the PTX text
        status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);

        // FIX: the original leaked all three JIT allocations
        delete[] jitLogBuffer;
        delete[] jitOptVals;
        delete[] jitOptions;
    }
    else
        status = cuModuleLoad(&cuModule, module_path.c_str());

    if (CUDA_SUCCESS != status)
        goto Error;

    // Fetch the kernel variant matching the address width.
    if (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L)
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
    else
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");

    if (CUDA_SUCCESS != status)
        goto Error;

    *pMatrixMul = cuFunction;
    return CUDA_SUCCESS;

Error:
    cuCtxDestroy(cuContext);
    return status;
}

// Allocate host/device matrices, launch the kernel (two alternative
// argument-passing styles), time the run, and verify the result on the CPU.
void runTest(int argc, char **argv)
{
    int block_size = 32;

    // Obtain the kernel handle via the Driver API.
    CUfunction matrixMul = NULL;
    CUresult error_id = initCUDA(argc, argv, &matrixMul);

    if (error_id != CUDA_SUCCESS) // FIX: the original ignored init failures and launched a NULL function
    {
        printf("initCUDA() returned %d, aborting test\n", (int)error_id);
        return;
    }

    // Host matrices: A is all 1s, B is all 0.01s, so every element of
    // C = A*B must equal WA * 0.01.
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);
    constantInit(h_A, size_A, 1.0f);  // all-ones matrix
    constantInit(h_B, size_B, 0.01f); // all-0.01 matrix

    // On 64-bit builds grab four 1 GB blocks first, purely to push the real
    // buffers above the 4 GB address boundary.
    CUdeviceptr d_Mem[4];
    if (use_64bit_memory_address)
    {
        unsigned int mem_size = 1024 * 1024 * 1024;
        cuMemAlloc(&d_Mem[0], mem_size);
        cuMemAlloc(&d_Mem[1], mem_size);
        cuMemAlloc(&d_Mem[2], mem_size);
        cuMemAlloc(&d_Mem[3], mem_size);
    }

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    // timing
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    // Two equivalent ways of passing kernel arguments through the Driver API.
    if (1)
    {
        // Style 1: array of pointers to the arguments (second-to-last
        // cuLaunchKernel parameter). size_t widths when the device holds
        // more than 4 GB, int widths otherwise — the call shape is the same.
        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
    }
    else
    {
        // Style 2: one packed argument buffer described by CU_LAUNCH_PARAM_*
        // (last cuLaunchKernel parameter), arguments written at increasing
        // byte offsets in the same order as Style 1.
        int offset = 0;
        char argBuffer[256];

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);

        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            *((size_t *)&argBuffer[offset]) = Matrix_Width_A; // FIX: store through size_t*, not CUdeviceptr*
            offset += sizeof(Matrix_Width_A);
            *((size_t *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            *((int *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((int *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }

        // Wrap buffer pointer + size + end marker for cuLaunchKernel's extra.
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
            CU_LAUNCH_PARAM_END
        };

        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size * block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C); // blocking copy also synchronizes the launch

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Verify against the analytic result WA * 0.01.
    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < (int)(WC * HC); i++)
    {
        if (fabs(h_C[i] - (WA * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * 0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    if (use_64bit_memory_address)
    {
        cuMemFree(d_Mem[0]);
        cuMemFree(d_Mem[1]);
        cuMemFree(d_Mem[2]);
        cuMemFree(d_Mem[3]);
    }
    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
}

int main(int argc, char **argv)
{
    printf("[ matrixMulDrv(Driver API) ]\n");
    runTest(argc, argv);

    getchar(); // keep the console window open
    return 0;
}
▶ 输出结果:
[ matrixMulDrv (Driver API) ] > Using CUDA Device [0]: GeForce GTX 1070 > GPU Device has SM 6.1 compute capability Total amount of global memory: 8589934592 bytes 64-bit Memory Address: YES sdkFindFilePath <matrixMul_kernel64.ptx> in ./ sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/ sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/ > findModulePath <./data/matrixMul_kernel64.ptx> > initCUDA loading module: <./data/matrixMul_kernel64.ptx> > PTX JIT log: Processing time: 0.568077 (ms) Checking computed result for correctness: Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
▶ 涨姿势:
● 头文件 matrixMul.h 的内容:
#ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_

// Matrix dimensions for the sample, all expressed as multiples of the
// thread-block edge `block_size` (defined by the including translation unit).
#define WA (4 * block_size)  // width of matrix A
#define HA (6 * block_size)  // height of matrix A
#define WB (4 * block_size)  // width of matrix B
#define HB WA                // height of B must equal width of A
#define WC WB                // C gets B's width
#define HC HA                // C gets A's height

#endif // _MATRIXMUL_H_
● C++ 中 string 类的基本使用方法
1 using namespace std; 2 3 string buf, buf2; 4 int n; 5 char *buf = new char[n];// 动态创建字符数组大小,类似malloc 6 buf[n - 1] = '\0'; // 手动结尾补零 7 buf2 = buf; // 直接赋值 8 delete[] buf; // 删除该数组,类似 free
● class StopWatchInterface ,定义于 helper_timer.h 中用于计时的一个类,这里只说明其使用方法,其内容在头文件随笔中详细讨论。
1 StopWatchInterface *timer = NULL; // 创建计时类指针 2 sdkCreateTimer(&timer); // 创建计时类 3 sdkStartTimer(&timer); // 开始计时 4 5 ... // 核函数运行过程 6 7 sdkStopTimer(&timer); // 停止计时 8 sdkGetTimerValue(&timer); // 获取时间(返回浮点类型的毫秒数) 9 sdkDeleteTimer(&timer); // 删除计时类
● cuda.h 中各种定义
typedef int CUdevice; // CUDA int 类型,用于标志设备号 typedef struct CUfunc_st *CUfunction; // CUDA 函数指针 typedef struct CUmod_st *CUmodule; // CUDA 模块指针 typedef struct CUctx_st *CUcontext; // CUDA 上下文指针 typedef enum cudaError_enum {...}CUresult; // CUDA 各种错误信息标号 typedef unsigned long long CUdeviceptr; // 无符号长长整型 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);// 获取设备名称 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); // 获取设备计算能力 inline CUdevice findCudaDeviceDRV(int argc, const char **argv); // 依命令行指定设备,否则选择计算能力最高的设备。内含函数调用 cuInit(0) #define cuDeviceTotalMem cuDeviceTotalMem_v2 // 获取显存大小 CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); #define cuMemAlloc cuMemAlloc_v2 // 申请显存 CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); #define cuMemFree cuMemFree_v2 // 释放显存 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); CUresult CUDAAPI cuInit(unsigned int Flags); // 重要的初始化设备参数,在创建上下文之前要先调用它,参数可以设为 0 #define cuCtxCreate cuCtxCreate_v2 // 创建上下文 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); #define cuCtxDestroy cuCtxDestroy_v2 // 销毁上下文 CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) // cudaMemcpy(cudaMemcpyHostToDevice)的别名 #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) // cudaMemcpy(cudaMemcpyDeviceToHost)的别名 #define __CUDA_API_PTDS(api) api // 从 ptx 流 image 中编译模块 module,并且包括 numOptions 个参数,参数名列表为 options,参数值列表为 optionValues CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); // 指定路径 fname 中获取模块 module CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); // 从指定模块 hmod 中获取名为 name 的函数赋给函数指针 hfunc CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
● 代码中使用了 goto 语句,基本使用过程如下。好处是整个函数 initCUDA 中只有一个 return,坏处是到处是跳转。
1 int function() 2 { 3 CUresult status; 4 5 status = cudaFunction(); 6 if (!status == CUDA_SUCCESS)// 函数cudaFunction运行不正常 7 goto Error; 8 9 ... // 函数运行正常 10 11 return 0; // 正常结束,返回 0 12 Error: 13 return status; // 非正常结束,返回首个错误编号 14 }
● Driver API 的简略使用过程。本篇源代码很长,但是压缩后可以变成以下内容,方便看出该接口函数的使用过程。
#include <stdio.h>
#include <math.h>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>

// Condensed Driver-API walkthrough: init -> context -> JIT module ->
// kernel handle -> buffers -> launch -> copy back -> CPU check.
int main()
{
    // handles and constants
    CUdevice cuDevice = 0;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction matrixMul = NULL;
    CUresult status;
    const char *module_path = "./data/matrixMul_kernel64.ptx"; // FIX: char[30] was an exact-fit buffer
    static char ptx_source[1 << 20];                           // FIX: generous buffer instead of the magic 63894

    // initialize the driver and create a context
    cuInit(0);
    status = cuCtxCreate(&cuContext, 0, cuDevice);

    // read the PTX text from disk
    FILE *fp = fopen(module_path, "rb");
    if (fp == NULL) // FIX: the original never checked fopen and never closed fp
    {
        printf("cannot open <%s>\n", module_path);
        return 1;
    }
    fseek(fp, 0, SEEK_END);
    long file_size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    fread(ptx_source, sizeof(char), file_size, fp);
    fclose(fp);
    ptx_source[file_size] = '\0'; // FIX: terminate at the actual size, not at index 63893

    // JIT compile options
    const unsigned int jitNumOptions = 3;
    CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
    void **jitOptVals = new void *[jitNumOptions];

    // compilation log size
    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    int jitLogBufferSize = 1024;
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

    // compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    char *jitLogBuffer = new char[jitLogBufferSize];
    jitLogBuffer[0] = '\0';
    jitOptVals[1] = jitLogBuffer;

    // register cap per thread
    jitOptions[2] = CU_JIT_MAX_REGISTERS;
    int jitRegCount = 32;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // build the module and fetch the kernel
    status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals);
    printf("\nPTX JIT log:\n%s\n", jitLogBuffer);
    status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit");

    // FIX: the original leaked the three JIT allocations
    delete[] jitLogBuffer;
    delete[] jitOptVals;
    delete[] jitOptions;

    // data preparation: A all 1s, B all 0.01s -> every C element must be wa*0.01
    int block_size = 32;
    int wa = 4 * block_size;
    int ha = 6 * block_size;
    int wb = 4 * block_size;
    int hb = wa;
    int wc = wb;
    int hc = ha;

    unsigned int size_A = wa * ha;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = wb * hb;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = wc * hc;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);

    for (unsigned int i = 0; i < size_A; ++i) // FIX: unsigned index matches size_A's type
        h_A[i] = 1.0f;
    for (unsigned int i = 0; i < size_B; ++i)
        h_B[i] = 0.01f;

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);
    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    dim3 block(block_size, block_size, 1);
    dim3 grid(wc / block_size, hc / block_size, 1);

    // two equivalent Driver-API launch styles
    if (1)
    {
        // Style 1: array of pointers to the kernel arguments
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
        // CUDA 4.0 Driver API launch, second-to-last pointer parameter
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size * block_size * sizeof(float), NULL, args, NULL);
    }
    else
    {
        // Style 2: packed argument buffer via CU_LAUNCH_PARAM_*, same order
        // as Style 1, advancing offset past each argument
        int offset = 0;
        char argBuffer[256];

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        size_t Matrix_Width_A = (size_t)wa;
        size_t Matrix_Width_B = (size_t)wb;
        *((size_t *)&argBuffer[offset]) = Matrix_Width_A; // FIX: store through size_t*, not CUdeviceptr*
        offset += sizeof(Matrix_Width_A);
        *((size_t *)&argBuffer[offset]) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        // buffer pointer + size + end marker wrapped for cuLaunchKernel's extra
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API launch, last pointer parameter
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size * block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C);

    // verify on the CPU against the analytic value wa * 0.01
    printf("Checking computed result for correctness: ");
    bool correct = true;
    for (int i = 0; i < (int)(wc * hc); i++)
    {
        if (fabs(h_C[i] - (wa * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa * 0.01f);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);

    getchar();
    return 0;
}