▶ Computing vector addition through three interfaces: the CUDA Runtime API, runtime compilation (NVRTC), and the Driver API
▶ Source code, CUDA Runtime API
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

#define ELEMENT 50000

__global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}

int main()
{
    printf("\tStart.\n");
    size_t size = ELEMENT * sizeof(float);

    // Allocate host and device memory, then fill the host arrays with random data
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch the kernel and copy the result back
    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify the result on the host
    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
● Output:
Start.
Finish.
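● Note: the listing above strips out the error checking that the original sample does through checkCudaErrors from helper_cuda.h. A minimal sketch of how each call could be wrapped instead, assuming it is pasted into the file above (the CHECK macro is a hypothetical stand-in, not part of the CUDA samples):

// Hypothetical error-checking macro: print the error string and abort on failure.
#define CHECK(call)                                                  \
    do {                                                             \
        cudaError_t err = (call);                                    \
        if (err != cudaSuccess)                                      \
        {                                                            \
            printf("CUDA error %s at %s:%d\n",                       \
                   cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

// Usage: wrap every runtime call; kernel launches are checked via cudaGetLastError()
// CHECK(cudaMalloc((void **)&d_A, size));
// vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
// CHECK(cudaGetLastError());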
▶ Source code, runtime compilation (NVRTC)
// vectorAdd_kernel.cu
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}
// vectorAdd.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <nvrtc_helper.h>

#define ELEMENT 50000

int main()
{
    printf("\n\tStart.\n");

    // Compile the .cu file to PTX at run time, load it as a module, and fetch the kernel
    char *ptx, *kernel_file;
    size_t ptxSize;
    kernel_file = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\vectorAdd_nvrtc\\vectorAdd_kernel.cu";
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);
    CUmodule module = loadPTX(ptx, 1, NULL);
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, module, "vectorAdd");

    size_t size = ELEMENT * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    CUdeviceptr d_A, d_B, d_C;
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    int threadsPerBlock = 256;
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize((ELEMENT + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
    int element = ELEMENT;
    // kernelParams: an array of pointers, one per kernel argument
    void *arr[] = { (void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&element };
    cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                   cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);

    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}
● Output:
Start.
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability
Finish.
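● compileFileToPTX() and loadPTX() come from the sample helper header nvrtc_helper.h; under the hood they call the NVRTC library. A rough sketch of the direct NVRTC calls they wrap, assuming a CUDA context already exists (cuInit/cuCtxCreate) and that the string src holds the contents of vectorAdd_kernel.cu; error checking is omitted:

#include <cuda.h>
#include <nvrtc.h>
#include <vector>

// Sketch: compile a kernel source string to PTX with NVRTC, then load the PTX
// through the Driver API and fetch the kernel handle.
CUfunction buildVectorAdd(const char *src)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "vectorAdd_kernel.cu", 0, NULL, NULL);
    const char *opts[] = { "--gpu-architecture=compute_61" };  // match the target GPU
    nvrtcCompileProgram(prog, 1, opts);
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);            // size includes the trailing '\0'
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);

    CUmodule module;
    CUfunction kernel;
    cuModuleLoadDataEx(&module, ptx.data(), 0, NULL, NULL);
    cuModuleGetFunction(&kernel, module, "vectorAdd");
    return kernel;
}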
▶ Source code, Driver API. This example also needs the vectorAdd_kernel.cu above (here loaded as a pre-built .ptx file). There are three ways to pass arguments when launching the kernel; the middle one is flawed and produces wrong results.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <helper_cuda.h>
#include <cuda.h>
#include <string>
#include <drvapi_error_string.h>

#define ELEMENT 50000
#define PATH "C:\\ProgramData\\NVIDIA Corporation\\CUDA Samples\\v9.1\\0_Simple\\vectorAddDrv\\data\\"

#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "vectorAdd_kernel64.ptx"
#else
#define PTX_FILE "vectorAdd_kernel32.ptx"
#endif

using namespace std;

void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

int main(int argc, char **argv)
{
    printf("\n\tStart.\n");
    cuInit(0);  // Roughly the Driver API counterpart of cudaSetDevice(0); the device must be initialized before a context can be created
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, 0);

    // Read the PTX source from file
    string module_path, ptx_source;
    module_path = PATH PTX_FILE;
    FILE *fp = fopen(module_path.c_str(), "rb");
    fseek(fp, 0, SEEK_END);
    int file_size = ftell(fp);
    char *buf = new char[file_size + 1];
    fseek(fp, 0, SEEK_SET);
    fread(buf, sizeof(char), file_size, fp);
    fclose(fp);
    buf[file_size] = '\0';
    ptx_source = buf;
    delete[] buf;

    CUmodule cuModule;
    if (module_path.rfind("ptx") != string::npos)  // A .ptx file still needs JIT compilation at load time
    {
        // Set up JIT options: CUjit_option holds the option names, jitOptVals the option values
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];
        // Size of the compile log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
        // The compile log buffer itself
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;
        // Maximum number of registers a kernel thread may use
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;
        // JIT-compile and load the module
        cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
        //printf("> PTX JIT log:\n%s\n", jitLogBuffer);  // Print the JIT compile log
        delete[] jitLogBuffer;
        delete[] jitOptVals;
        delete[] jitOptions;
    }
    else  // A .cubin file is loaded directly without JIT compilation (this branch is not taken in this example)
        cuModuleLoad(&cuModule, module_path.c_str());

    CUfunction vecAdd_kernel;
    cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel");  // Fetch the kernel from the loaded module

    // Allocate memory and run the computation
    int element = ELEMENT;
    size_t size = ELEMENT * sizeof(float);
    float *h_A, *h_B, *h_C;
    CUdeviceptr d_A, d_B, d_C;
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    RandomInit(h_A, ELEMENT);
    RandomInit(h_B, ELEMENT);
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    if (1)  // Way 1 of 3: kernelParams, an array of pointers to each argument
    {
        void *args[] = { &d_A, &d_B, &d_C, &element };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL);
    }
    else if (1)  // Way 2: broken — argBuffer is an array of void*, so the byte offsets index pointer-sized slots,
                 // and the raw buffer is passed as "extra" without the required CU_LAUNCH_PARAM_* markers
    {
        int offset = 0;
        void *argBuffer[64];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer);
    }
    else  // Way 3: correct — pack the arguments into a byte buffer and describe it with CU_LAUNCH_PARAM_* markers in "extra"
    {
        int offset = 0;
        char argBuffer[256];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        void *kernel_launch_config[5] =
        { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, (void **)&kernel_launch_config);
    }
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);
    int i;
    for (i = 0; i < ELEMENT; ++i)
    {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
        {
            printf("Error at i == %d, h_C[i] == %f, sum == %f", i, h_C[i], sum);
            break;
        }
    }
    printf("\n\tFinish: %s\n", (i == ELEMENT) ? "Pass" : "Fail");
    getchar();
    return 0;
}
● Output:
Start.
Finish.
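● My reading of why the middle launch method fails (this explanation is mine, not from the original sample): cuLaunchKernel takes arguments either through kernelParams (an array of pointers, one per argument) or through extra (a single packed byte buffer described by CU_LAUNCH_PARAM_* markers), and the two must not be mixed. The second branch packs by byte offsets into an array of void*, so argBuffer[offset] advances by whole pointers rather than bytes, and it then hands the raw buffer to extra without the required markers. A minimal corrected sketch of that branch, reusing the variables from the listing above (it is essentially what the third branch already does):

// Pack the arguments into a byte buffer and describe it with the CU_LAUNCH_PARAM_* markers.
char argBuffer[256];
size_t offset = 0;  // CU_LAUNCH_PARAM_BUFFER_SIZE expects a pointer to size_t
*(CUdeviceptr *)(argBuffer + offset) = d_A;  offset += sizeof(d_A);
*(CUdeviceptr *)(argBuffer + offset) = d_B;  offset += sizeof(d_B);
*(CUdeviceptr *)(argBuffer + offset) = d_C;  offset += sizeof(d_C);
*(int *)(argBuffer + offset) = element;      offset += sizeof(element);
void *extra[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                  CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
                  CU_LAUNCH_PARAM_END };
cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1,
               0, NULL, NULL, extra);  // kernelParams must be NULL when extra is used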
▶ Things learned:
● Parts trimmed from the original sample code
CUresult CleanupNoFailure()  // Release memory and return the last CUresult
{
    CUresult error;
    // Free device memory
    if (d_A)
        error = cuMemFree(d_A);
    if (d_B)
        error = cuMemFree(d_B);
    if (d_C)
        error = cuMemFree(d_C);
    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    error = cuCtxDestroy(cuContext);
    return error;
}

void Cleanup(bool noError)  // Report errors
{
    CUresult error = CleanupNoFailure();
    if (!noError || error != CUDA_SUCCESS)
    {
        printf("Function call failed\nFAILED\n");
        exit(EXIT_FAILURE);
    }
    if (!noprompt)
    {
        printf("\nPress ENTER to exit...\n");
        fflush(stdout);
        fflush(stderr);
        getchar();
    }
}

if (error != CUDA_SUCCESS)  // How the caller invokes Cleanup on error
    Cleanup(false);

if (argc > 1)  // In main: pick the device with a -device=n command-line argument
{
    bool bFound = false;
    for (int param = 0; param < argc; param++)  // Check the arguments one by one
    {
        int string_start = 0;
        while (argv[param][string_start] == '-')  // Skip the leading '-' characters
            string_start++;
        char *string_argv = &argv[param][string_start];
        if (!strncmp(string_argv, "device", 6))  // Is this the "device" option?
        {
            int len = (int)strlen(string_argv);
            while (string_argv[len] != '=')
                len--;
            devID = atoi(&string_argv[++len]);
            bFound = true;
        }
        if (bFound)
            break;
    }
}
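● For what it's worth, the samples' helper_string.h already provides command-line helpers that do the same job as the hand-rolled -device=n parsing above. A short sketch, assuming helper_string.h is on the include path:

#include <helper_string.h>  // checkCmdLineFlag / getCmdLineArgumentInt from the CUDA samples

// Pick the device from a "-device=n" argument, defaulting to device 0.
int devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");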