
▶ Computing vector addition through three interfaces: the CUDA Runtime API, runtime compilation (NVRTC), and the Driver API

▶ Source code: CUDA Runtime API

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

#define ELEMENT 50000

// One thread per element: C[i] = A[i] + B[i]
__global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}

int main()
{
    printf("\tStart.\n");
    size_t size = ELEMENT * sizeof(float);

    // Allocate host and device buffers
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Fill the inputs with random values in [0, 1] and copy them to the device
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Round the grid size up so every element gets a thread
    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify the result on the host
    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

● Output:

    Start.

    Finish.
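
● Note: the listing includes helper_cuda.h but never checks a return code. Below is a minimal sketch of error checking with that header's checkCudaErrors macro, reusing the names from the listing above; on failure the macro prints the file, line, and CUDA error string and exits.

checkCudaErrors(cudaMalloc((void **)&d_A, size));
checkCudaErrors(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, ELEMENT);
checkCudaErrors(cudaGetLastError());      // catches launch-configuration errors
checkCudaErrors(cudaDeviceSynchronize()); // catches errors raised while the kernel ran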

 

▶ Source code: runtime compilation (NVRTC)

// vectorAdd_kernel.cu
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        C[i] = A[i] + B[i];
}
// vectorAdd.cpp
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <nvrtc_helper.h>

#define ELEMENT 50000

int main()
{
    printf("\n\tStart.\n");

    // Compile the .cu file to PTX at run time, load it as a module,
    // and look up the kernel by name
    char *ptx;
    size_t ptxSize;
    char kernel_file[] = "D:\\Program\\CUDA9.0\\Samples\\0_Simple\\vectorAdd_nvrtc\\vectorAdd_kernel.cu";
    compileFileToPTX(kernel_file, 1, NULL, &ptx, &ptxSize, 0);
    CUmodule module = loadPTX(ptx, 1, NULL);
    CUfunction kernel_addr;
    cuModuleGetFunction(&kernel_addr, module, "vectorAdd");

    size_t size = ELEMENT * sizeof(float);

    // Host buffers come from malloc; device buffers from the Driver API allocator
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    CUdeviceptr d_A, d_B, d_C;
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    for (int i = 0; i < ELEMENT; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    // Kernel arguments are passed as an array of pointers to each argument
    int threadsPerBlock = 256;
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize((ELEMENT + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
    int element = ELEMENT;
    void *arr[] = { (void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&element };
    cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y, cudaGridSize.z,
                   cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, 0, 0, &arr[0], 0);
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);

    // Verify the result on the host
    for (int i = 0; i < ELEMENT; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            printf("\n\tResult error at i = %d, h_A[i] = %f, h_B[i] = %f, h_C[i] = %f\n", i, h_A[i], h_B[i], h_C[i]);
            getchar();
            return 1;
        }
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    printf("\n\tFinish.\n");
    getchar();
    return 0;
}

● Output:

        Start.
> Using CUDA Device [0]: GeForce GTX 1070
> GPU Device has SM 6.1 compute capability

        Finish.
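
● Note: compileFileToPTX and loadPTX are helpers from the sample's nvrtc_helper.h, not part of CUDA proper; underneath they drive the NVRTC library. A minimal sketch of the same compile step with the raw NVRTC API follows, assuming src holds the text of vectorAdd_kernel.cu and a CUDA context is already current; the --gpu-architecture value is an assumption matching the GTX 1070 above.

#include <nvrtc.h>

nvrtcProgram prog;
nvrtcCreateProgram(&prog, src, "vectorAdd_kernel.cu", 0, NULL, NULL);
const char *opts[] = { "--gpu-architecture=compute_61" }; // assumed target architecture
nvrtcCompileProgram(prog, 1, opts);
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char *ptx = new char[ptxSize];
nvrtcGetPTX(prog, ptx);                  // PTX text, ready for module loading
nvrtcDestroyProgram(&prog);
CUmodule module;
cuModuleLoadData(&module, ptx);          // load into the current context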

 

▶ Source code: Driver API (this also needs the vectorAdd_kernel.cu above). There are three ways to pass the kernel its arguments; the middle one is buggy and gives wrong results: offset counts bytes but is used to index an array of void *, and the raw buffer is handed to cuLaunchKernel's extra parameter without the CU_LAUNCH_PARAM_BUFFER_POINTER wrapper that parameter requires.

#include <stdio.h>
#include <math.h>
#include <helper_cuda.h>
#include <cuda.h>
#include <string>
#include <drvapi_error_string.h>

#define ELEMENT 50000
#define PATH "C:\\ProgramData\\NVIDIA Corporation\\CUDA Samples\\v9.1\\0_Simple\\vectorAddDrv\\data\\"

#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "vectorAdd_kernel64.ptx"
#else
#define PTX_FILE "vectorAdd_kernel32.ptx"
#endif

using namespace std;

void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

int main(int argc, char **argv)
{
    printf("\n\tStart.\n");
    cuInit(0); // counterpart of the Runtime API's cudaSetDevice(0); the driver must be initialized before a context can be created
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, 0);

    // Read the PTX source from disk (JIT-compiled below)
    string module_path, ptx_source;
    module_path = PATH PTX_FILE;
    FILE *fp = fopen(module_path.c_str(), "rb");
    if (fp == NULL)
    {
        printf("Failed to open %s\n", module_path.c_str());
        return 1;
    }
    fseek(fp, 0, SEEK_END);
    int file_size = ftell(fp);
    char *buf = new char[file_size + 1];
    fseek(fp, 0, SEEK_SET);
    fread(buf, sizeof(char), file_size, fp);
    fclose(fp);
    buf[file_size] = '\0';
    ptx_source = buf;
    delete[] buf;

    CUmodule cuModule;
    if (module_path.rfind("ptx") != string::npos) // a .ptx file must be JIT-compiled at load time
    {
        // Set JIT options: jitOptions holds the option names, jitOptVals the matching values
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];
        // size of the info-log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
        // buffer that receives the info log
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;
        // cap the number of registers a kernel may use
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;
        // JIT-compile the PTX into a module
        cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);
        //printf("> PTX JIT log:\n%s\n", jitLogBuffer); // print the JIT log
        delete[] jitLogBuffer;
        delete[] jitOptVals;
        delete[] jitOptions;
    }
    else // a .cubin is already compiled and can be loaded directly (not taken in this example)
        cuModuleLoad(&cuModule, module_path.c_str());

    CUfunction vecAdd_kernel;
    cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"); // look up the kernel in the compiled module

    // Allocate memory and run the computation
    int element = ELEMENT;
    size_t size = ELEMENT * sizeof(float);
    float *h_A, *h_B, *h_C;
    CUdeviceptr d_A, d_B, d_C;
    h_A = (float *)malloc(size);
    h_B = (float *)malloc(size);
    h_C = (float *)malloc(size);
    RandomInit(h_A, ELEMENT);
    RandomInit(h_B, ELEMENT);
    cuMemAlloc(&d_A, size);
    cuMemAlloc(&d_B, size);
    cuMemAlloc(&d_C, size);
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    int threadsPerBlock = 256;
    int blocksPerGrid = (ELEMENT + threadsPerBlock - 1) / threadsPerBlock;
    if (1)      // three ways to pass kernel arguments through the Driver API
    {
        void *args[] = { &d_A, &d_B, &d_C, &element };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL);
    }
    else if (1) // buggy: offset counts bytes but indexes an array of void *, and "extra" lacks the CU_LAUNCH_PARAM wrapper
    {
        int offset = 0;
        void *argBuffer[64];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer);
    }
    else        // correct: pack the arguments into a byte buffer and describe it with CU_LAUNCH_PARAM_* tokens
    {
        size_t offset = 0; // size_t: CU_LAUNCH_PARAM_BUFFER_SIZE reads a size_t through this pointer
        char argBuffer[256];
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);
        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((int *)&argBuffer[offset]) = element;
        offset += sizeof(element);
        void *kernel_launch_config[5] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, (void **)&kernel_launch_config);
    }
    cuCtxSynchronize();
    cuMemcpyDtoH(h_C, d_C, size);

    // Verify the result on the host
    int i;
    for (i = 0; i < ELEMENT; ++i)
    {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
        {
            printf("Error at i == %d, h_C[i] == %f, sum == %f", i, h_C[i], sum);
            break;
        }
    }
    printf("\n\tFinish: %s\n", (i == ELEMENT) ? "Pass" : "Fail");
    getchar();
    return 0;
}

● Output:

    Start.

    Finish.
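
● Note: the third variant follows the "extra" parameter protocol of cuLaunchKernel. For reference, below is a minimal sketch of the pattern from the cuLaunchKernel documentation with explicit alignment handling; it reuses the names from the listing above, and ALIGN_UP comes from the documentation's example (C++11 alignof would work in place of __alignof).

#define ALIGN_UP(offset, alignment) \
    (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

char argBuffer[256];
size_t offset = 0; // size_t, because CU_LAUNCH_PARAM_BUFFER_SIZE expects a size_t *
ALIGN_UP(offset, __alignof(d_A));
*(CUdeviceptr *)&argBuffer[offset] = d_A;
offset += sizeof(d_A);
ALIGN_UP(offset, __alignof(d_B));
*(CUdeviceptr *)&argBuffer[offset] = d_B;
offset += sizeof(d_B);
ALIGN_UP(offset, __alignof(d_C));
*(CUdeviceptr *)&argBuffer[offset] = d_C;
offset += sizeof(d_C);
ALIGN_UP(offset, __alignof(element));
*(int *)&argBuffer[offset] = element;
offset += sizeof(element);
void *config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
                   CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
                   CU_LAUNCH_PARAM_END };
cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, config);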

 

▶ Notes:

● Parts trimmed from the original sample code

CUresult CleanupNoFailure() // free device/host memory and the context, returning the last error code
{
    CUresult error;
    // Free device memory
    if (d_A)
        error = cuMemFree(d_A);
    if (d_B)
        error = cuMemFree(d_B);
    if (d_C)
        error = cuMemFree(d_C);
    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    error = cuCtxDestroy(cuContext);
    return error;
}

void Cleanup(bool noError) // report errors and exit
{
    CUresult error = CleanupNoFailure();
    if (!noError || error != CUDA_SUCCESS)
    {
        printf("Function call failed\nFAILED\n");
        exit(EXIT_FAILURE);
    }
    if (!noprompt)
    {
        printf("\nPress ENTER to exit...\n");
        fflush(stdout);
        fflush(stderr);
        getchar();
    }
}

if (error != CUDA_SUCCESS) // call sites invoke Cleanup when an API call fails
    Cleanup(false);

if (argc > 1)              // in main: a -device=n argument selects the device
{
    bool bFound = false;
    for (int param = 0; param < argc; param++)   // scan each argument
    {
        int string_start = 0;
        while (argv[param][string_start] == '-') // skip leading '-' characters
            string_start++;
        char *string_argv = &argv[param][string_start];
        if (!strncmp(string_argv, "device", 6))  // is this the "device" option?
        {
            int len = (int)strlen(string_argv);
            while (string_argv[len] != '=')
                len--;
            devID = atoi(&string_argv[++len]);
            bFound = true;
        }
        if (bFound)
            break;
    }
}
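
● With this parsing in place the sample can be started as, for example, vectorAddDrv.exe -device=1 to select the second GPU.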

 

posted on 2017-12-09 19:51 by 爨爨爨好