OpenCL流程分析与示例

OpenCL流程分析与示例

 

 

 OpenCL示例

Vortex存储库的tests/OpenCL目录中有OpenCL测试程序。OpenCL程序分为主机代码和设备代码。[cc|cpp]和kernel.cl。

OpenCL通过在设备端并行执行内核来加快速度。来看看tests/opencl/sgemm中的代码作为一个具体的例子。注意,代码经过了轻微修改,使差异更加清晰。

下面显示了主机上运行的main.cc中的matmul函数。

void matmul(const float* A,
            const float* B,
            float*       C,
            int          N) {
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < N; ++k) {
        acc += A[i + k * N] * B[k + j * N];
      }
      C[i + j * N] = acc;
    }
  }
}

下面是在与上述功能相对应的设备上运行的kernel.cl。

__kernel void sgemm(__global const float* A,
                    __global const float* B,
                    __global float*       C,
                    int                   N) {
  const int i = get_global_id(0);
  const int j = get_global_id(1);
  float acc = 0.0f;
  for (int k = 0; k < N; ++k) {
    acc += A[i + k * N] * B[k + j * N];
  }
  C[i + j * N] = acc;
}

 

sudo apt-get install ocl-icd-opencl-dev

c. 按流程编码

//

// Created by yang on 24-2-2.

//

#include <CL/cl.h>

#include <stdio.h>

#include <stdlib.h>

#define ARRAY_SIZE 1024

// OpenCL kernel

const char* kernelSource =

        "__kernel void vectorAdd(__global const float* a, __global const float* b, __global float* result) {\n"

        "    int index = get_global_id(0);\n"

        "    result[index] = a[index] + b[index];\n"

        "}\n";

int main() {

    // Initialize input vectors

    float a[ARRAY_SIZE];

    float b[ARRAY_SIZE];

    float result[ARRAY_SIZE];

    for (int i = 0; i < ARRAY_SIZE; ++i) {

        a[i] = i;

        b[i] = i * 2;

    }

    // Load OpenCL platform

    cl_platform_id platform;

    clGetPlatformIDs(1, &platform, NULL);

    // Load OpenCL device

    cl_device_id device;

    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    // Create OpenCL context

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

    // Create command queue

    cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

    // Create OpenCL program

    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, NULL);

    // Build OpenCL program

    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // Create OpenCL kernel

    cl_kernel kernel = clCreateKernel(program, "vectorAdd", NULL);

    // Create OpenCL buffers

    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ARRAY_SIZE, a, NULL);

    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ARRAY_SIZE, b, NULL);

    cl_mem bufferResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * ARRAY_SIZE, NULL, NULL);

    // Set OpenCL kernel arguments

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);

    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);

    clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferResult);

    // Execute OpenCL kernel

    size_t globalSize = ARRAY_SIZE;

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);

    clFinish(queue);

    // Read the result from OpenCL buffer

    clEnqueueReadBuffer(queue, bufferResult, CL_TRUE, 0, sizeof(float) * ARRAY_SIZE, result, 0, NULL, NULL);

    // Display the result

    for (int i = 0; i < ARRAY_SIZE; ++i) {

        printf("%f + %f = %f\n", a[i], b[i], result[i]);

    }

    // Clean up

    clReleaseMemObject(bufferA);

    clReleaseMemObject(bufferB);

    clReleaseMemObject(bufferResult);

    clReleaseKernel(kernel);

    clReleaseProgram(program);

    clReleaseCommandQueue(queue);

    clReleaseContext(context);

    return 0;

}

d. 编译
gcc -O hello_opencl hello_cl.c -lOpenCL

e. 执行
./hello_opencl

Demo示例

在vortex 下编写和运行OpenCL内核代码和程序(vecadd demo)

http://main.cc代码如下:

#include <stdio.h>

#include <stdlib.h>

#include <assert.h>

#include <math.h>

#include <CL/opencl.h>

#include <unistd.h>

#include <string.h>

#include <chrono>

#define KERNEL_NAME "vecadd"

#define CL_CHECK(_expr)                                                \

   do {                                                                \

     cl_int _err = _expr;                                              \

     if (_err == CL_SUCCESS)                                           \

       break;                                                          \

     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \

    cleanup();                                                               \

     exit(-1);                                                         \

   } while (0)

#define CL_CHECK2(_expr)                                               \

   ({                                                                  \

     cl_int _err = CL_INVALID_VALUE;                                   \

     decltype(_expr) _ret = _expr;                                     \

     if (_err != CL_SUCCESS) {                                         \

       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \

      cleanup();                                                           \

       exit(-1);                                                       \

     }                                                                 \

     _ret;                                                             \

   })

static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {

  if (nullptr == filename || nullptr == data || 0 == size)

    return -1;

  FILE* fp = fopen(filename, "r");

  if (NULL == fp) {

    fprintf(stderr, "Failed to load kernel.");

    return -1;

  }

  fseek(fp , 0 , SEEK_END);

  long fsize = ftell(fp);

  rewind(fp);

  *data = (uint8_t*)malloc(fsize);

  *size = fread(*data, 1, fsize, fp);

 

  fclose(fp);

 

  return 0;

}

static bool almost_equal(float a, float b, int ulp = 4) {

  union fi_t { int i; float f; };

  fi_t fa, fb;

  fa.f = a;

  fb.f = b;

  return std::abs(fa.i - fb.i) <= ulp;

}

cl_device_id device_id = NULL;

cl_context context = NULL;

cl_command_queue commandQueue = NULL;

cl_program program = NULL;

cl_kernel kernel = NULL;

cl_mem a_memobj = NULL;

cl_mem b_memobj = NULL;

cl_mem c_memobj = NULL; 

float *h_a = NULL;

float *h_b = NULL;

float *h_c = NULL;

uint8_t *kernel_bin = NULL;

static void cleanup() {

  if (commandQueue) clReleaseCommandQueue(commandQueue);

  if (kernel) clReleaseKernel(kernel);

  if (program) clReleaseProgram(program);

  if (a_memobj) clReleaseMemObject(a_memobj);

  if (b_memobj) clReleaseMemObject(b_memobj);

  if (c_memobj) clReleaseMemObject(c_memobj); 

  if (context) clReleaseContext(context);

  if (device_id) clReleaseDevice(device_id);

 

  if (kernel_bin) free(kernel_bin);

  if (h_a) free(h_a);

  if (h_b) free(h_b);

  if (h_c) free(h_c);

}

int size = 64;

static void show_usage() {

  printf("Usage: [-n size] [-h: help]\n");

}

static void parse_args(int argc, char **argv) {

  int c;

  while ((c = getopt(argc, argv, "n:h?")) != -1) {

    switch (c) {

    case 'n':

      size = atoi(optarg);

      break;

    case 'h':

    case '?': {

      show_usage();

      exit(0);

    } break;

    default:

      show_usage();

      exit(-1);

    }

  }

  printf("Workload size=%d\n", size);

}

int main (int argc, char **argv) {

  // parse command arguments

  parse_args(argc, argv);

 

  cl_platform_id platform_id;

  size_t kernel_size;

 

  // Getting platform and device information

  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));

  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));

  printf("Create context\n");

  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));

  printf("Allocate device buffers\n");

  size_t nbytes = size * sizeof(float);

  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");

#ifdef HOSTGPU

  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))

    return -1;

  program = CL_CHECK2(clCreateProgramWithSource(

    context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); 

#else

  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))

    return -1;

  program = CL_CHECK2(clCreateProgramWithBinary(

    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));

#endif

  // Build program

  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

 

  // Create kernel

  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  // Set kernel arguments

  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj)); 

  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); 

  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));

  // Allocate memories for input arrays and output arrays.   

  h_a = (float*)malloc(nbytes);

  h_b = (float*)malloc(nbytes);

  h_c = (float*)malloc(nbytes);   

  

  // Generate input values

  for (int i = 0; i < size; ++i) {

    h_a[i] = sinf(i)*sinf(i);

    h_b[i] = cosf(i)*cosf(i);

  }

  // Creating command queue

  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); 

   printf("Upload source buffers\n");

  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));

  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));

  printf("Execute the kernel\n");

  size_t global_work_size[1] = {size};

  auto time_start = std::chrono::high_resolution_clock::now();

  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL));

  CL_CHECK(clFinish(commandQueue));

  auto time_end = std::chrono::high_resolution_clock::now();

  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();

  printf("Elapsed time: %lg ms\n", elapsed);

  printf("Download destination buffer\n");

  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));

  printf("Verify result\n");

  int errors = 0;

  for (int i = 0; i < size; ++i) {

    float ref = h_a[i] + h_b[i];

    if (!almost_equal(h_c[i], ref)) {

      if (errors < 100)

        printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);

      ++errors;

    }

  }

  if (0 == errors) {

    printf("PASSED!\n");

  } else {

    printf("FAILED! - %d errors\n", errors);   

  }

  // Clean up    

  cleanup(); 

  return errors;

}

openCL内核代码如下:

__kernel void vecadd (__global const float *A,

                       __global const float *B,

                       __global float *C)

{

  int gid = get_global_id(0);

  C[gid] = A[gid] + B[gid];

}

 

参考文献链接

https://www.luffca.com/2023/03/riscv-gpgpu-vortex-part2/

https://zhuanlan.zhihu.com/p/681397034

posted @   吴建明wujianming  阅读(75)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)
历史上的今天:
2023-06-25 Cpu0算术运算指令和逻辑运算指令
2022-06-25 音视频与CPU架构
2021-06-25 激光雷达Lidar多制式产品
2020-06-25 红外红外传感器电路图及工作原理
2020-06-25 PCB的IPC标准是什么
2020-06-25 使用Keil语言的嵌入式C编程教程(下)
2020-06-25 使用Keil语言的嵌入式C编程教程(上)
点击右上角即可分享
微信分享提示