Data Parallelism and Task Parallelism

An OpenCL Example of Parallel Add, Subtract, Multiply, and Divide: Data Parallelism and Task Parallelism



 ==============================================================

Contents

1. Data parallelism

2. Task parallelism

3. References

 ==============================================================

 

Keywords: OpenCL; data parallel; task parallel

Data-parallel computation and task-parallel decomposition are two ways to speed up a program.

Consider the basic arithmetic example below: from input arrays A and B we compute an output array C, whose values are shown as "output" in the figure.

Figure 1. Add/subtract/multiply/divide example

The result can be computed with the following code, which we will refer to as the functional routine for now:

    /* A and B are the 4x4 input arrays shown in Figure 1. */
    float C[16];
    int i;

    for (i = 0; i < 4; i++)
    {
        C[i*4+0] = A[i*4+0] + B[i*4+0]; // task A
        C[i*4+1] = A[i*4+1] - B[i*4+1]; // task B
        C[i*4+2] = A[i*4+2] * B[i*4+2]; // task C
        C[i*4+3] = A[i*4+3] / B[i*4+3]; // task D
    }

1. Data parallelism (data parallel)

Notice that each iteration of the for loop consists of four tasks (addition, subtraction, multiplication, and division), labelled task A, task B, task C, and task D. If the four compute blocks i=0 through i=3 run one after another starting at time 0, let the total running time be T.

Figure 2. Sequential execution

Figure 2 also shows that each block reads a different portion of A and B (the colors in the figure match the task colors). Since there are no data dependencies between the blocks, the program can run the four blocks i = 0, 1, 2, 3 together, feeding different data to the same processing function at the same time; ideally this cuts the running time to T/4, as shown in Figure 3. Applying the same kernel function to different data in this way is called data parallelism.

 

Figure 3. Data-parallel approach

The OpenCL API function used for data parallelism is clEnqueueNDRangeKernel().
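
For reference, its declaration in the OpenCL 1.x headers is shown below. work_dim, global_work_size, and local_work_size together define the index space of work-items, and each work-item finds its own position inside the kernel with get_global_id().

    cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
                                  cl_kernel        kernel,
                                  cl_uint          work_dim,
                                  const size_t    *global_work_offset,
                                  const size_t    *global_work_size,
                                  const size_t    *local_work_size,
                                  cl_uint          num_events_in_wait_list,
                                  const cl_event  *event_wait_list,
                                  cl_event        *event);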

The reference program is given below:

host.cpp:

    #include "stdafx.h"
    #include <stdio.h>
    #include <stdlib.h>
    #include <string>
    #include <CL/cl.h>
    #include <time.h>

    #define MAX_SOURCE_SIZE (0x100000)

    // data parallel
    int main()
    {
        cl_platform_id platform_id = NULL;
        cl_device_id device_id = NULL;
        cl_context context = NULL;
        cl_command_queue command_queue = NULL;
        cl_mem Amobj = NULL;
        cl_mem Bmobj = NULL;
        cl_mem Cmobj = NULL;
        cl_program program = NULL;
        cl_kernel kernel = NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret;

        int i, j;
        float *A;
        float *B;
        float *C;

        A = (float *)malloc(4 * 4 * sizeof(float));
        B = (float *)malloc(4 * 4 * sizeof(float));
        C = (float *)malloc(4 * 4 * sizeof(float));

        FILE *fp;
        const char fileName[] = "./dataParallel.cl";
        size_t source_size;
        char *source_str;

        /* Load kernel source file */
        fp = fopen(fileName, "r");
        if (!fp) {
            fprintf(stderr, "Failed to load kernel.\n");
            exit(1);
        }
        source_str = (char *)malloc(MAX_SOURCE_SIZE);
        source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);

        /* Initialize input data */
        printf("Initialize input data");
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                A[i * 4 + j] = i * 4 + j + 1;
                B[i * 4 + j] = j * 4 + i + 1;
            }
        }
        printf("\n");

        printf("A array data:\n");
        for (i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                printf("%.2f\t", A[i * 4 + j]);
            }
            printf("\n");
        }

        printf("B array data:\n");
        for (i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                printf("%.2f\t", B[i * 4 + j]);
            }
            printf("\n");
        }

        clock_t start, finish;
        double duration;
        printf("DataParallel kernels start to execute\n");
        start = clock();

        /* Get platform/device information */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                             &ret_num_devices);

        /* Create OpenCL context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

        /* Create command queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

        /* Create buffer objects */
        Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4 * 4 * sizeof(float), NULL, &ret);
        Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4 * 4 * sizeof(float), NULL, &ret);
        Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4 * 4 * sizeof(float), NULL, &ret);

        /* Copy input data to the memory buffers */
        ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4 * 4 * sizeof(float),
                                   A, 0, NULL, NULL);
        ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4 * 4 * sizeof(float),
                                   B, 0, NULL, NULL);

        /* Create kernel program from the source file */
        program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                            (const size_t *)&source_size, &ret);
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

        /* Create data parallel OpenCL kernel */
        kernel = clCreateKernel(program, "dataParallel", &ret);

        /* Set OpenCL kernel arguments */
        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);

        size_t global_item_size = 4;
        size_t local_item_size = 1;

        /* Execute OpenCL kernel as data parallel */
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                     &global_item_size, &local_item_size, 0, NULL, NULL);

        /* Transfer result to host */
        ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4 * 4 * sizeof(float),
                                  C, 0, NULL, NULL);

        // end of execution
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf("\n%f seconds\n", duration);

        /* Display results */
        printf("Calculation result:\n");
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                printf("%7.2f\t", C[i * 4 + j]);
            }
            printf("\n");
        }

        /* Finalization */
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(Amobj);
        ret = clReleaseMemObject(Bmobj);
        ret = clReleaseMemObject(Cmobj);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);

        free(source_str);
        free(A);
        free(B);
        free(C);

        system("pause");
        return 0;
    }

kernel.cl: 

    __kernel void dataParallel(__global float* A, __global float* B, __global float* C)
    {
        int base = 4 * get_global_id(0);

        C[base+0] = A[base+0] + B[base+0];
        C[base+1] = A[base+1] - B[base+1];
        C[base+2] = A[base+2] * B[base+2];
        C[base+3] = A[base+3] / B[base+3];
    }
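
Since the host program uses global_item_size = 4 and local_item_size = 1 over one dimension, get_global_id(0) takes the values 0 through 3, so each work-item computes one complete row of the 4x4 output array C.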

2. Task parallelism (task parallel)

The other approach is task parallelism, in which the statements inside the functional routine execute in parallel. The routine in this article decomposes into four tasks (add, subtract, multiply, divide), so we can create four kernel functions, one per operation, and let them run at the same time, as shown in the figure below.

Figure 4. Task-parallel approach

Take the red kernel in Figure 4 as an example: it performs the additions on the first column of arrays A and B. As time progresses, this addition kernel computes A[0] + B[0], A[4] + B[4], A[8] + B[8], and A[12] + B[12] in turn.

The OpenCL API function used for task parallelism is clEnqueueTask().
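
clEnqueueTask() submits a kernel as a single work-item (it is equivalent to clEnqueueNDRangeKernel with work_dim = 1 and a global work size of 1). Its declaration in the OpenCL 1.x headers is:

    cl_int clEnqueueTask(cl_command_queue command_queue,
                         cl_kernel        kernel,
                         cl_uint          num_events_in_wait_list,
                         const cl_event  *event_wait_list,
                         cl_event        *event);

For the four tasks to actually overlap, the command queue must allow out-of-order execution (the host program below passes CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE when creating it), or each kernel can be placed on its own command queue.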

The reference program is given below:

host.cpp: 

    // taskparallel.cpp : entry point of the console application.
    //

    #include "stdafx.h"
    #include <stdio.h>
    #include <stdlib.h>
    #include <string>
    #include <CL/cl.h>
    #include <time.h>

    #define MAX_SOURCE_SIZE (0x100000)

    int main()
    {
        cl_platform_id platform_id = NULL;
        cl_device_id device_id = NULL;
        cl_context context = NULL;
        cl_command_queue command_queue = NULL;
        cl_mem Amobj = NULL;
        cl_mem Bmobj = NULL;
        cl_mem Cmobj = NULL;
        cl_program program = NULL;
        cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret;

        int i, j;
        float *A, *B, *C;

        A = (float *)malloc(4 * 4 * sizeof(float));
        B = (float *)malloc(4 * 4 * sizeof(float));
        C = (float *)malloc(4 * 4 * sizeof(float));

        FILE *fp;
        const char fileName[] = "./taskParallel.cl";
        size_t source_size;
        char *source_str;

        // load kernel source file
        fp = fopen(fileName, "rb");
        if (!fp) {
            fprintf(stderr, "Failed to load kernel\n");
            exit(1);
        }

        source_str = (char *)malloc(MAX_SOURCE_SIZE);
        source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);

        // initialize input data
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                A[i*4+j] = i*4+j+1;
                B[i*4+j] = j*4+i+1;
            }
        }

        // print A
        printf("\nA initialization data: \n");
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                printf("%.2f\t", A[i*4+j]);
            }
            printf("\n");
        }

        // print B
        printf("\nB initialization data: \n");
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                printf("%.2f\t", B[i*4+j]);
            }
            printf("\n");
        }

        clock_t start, finish;
        double duration;
        printf("TaskParallel kernels start to execute\n");
        start = clock();

        // get platform/device information
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

        // create OpenCL context
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

        // create command queue (out-of-order so the four tasks may overlap)
        command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

        // create buffer objects
        Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
        Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
        Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);

        // copy input data to the memory buffers
        ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
        ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);

        // create kernel program from source
        program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

        // create the four task-parallel kernels
        kernel[0] = clCreateKernel(program, "add_parallel", &ret);
        kernel[1] = clCreateKernel(program, "sub_parallel", &ret);
        kernel[2] = clCreateKernel(program, "mul_parallel", &ret);
        kernel[3] = clCreateKernel(program, "div_parallel", &ret);

        // set OpenCL kernel arguments
        for (i = 0; i < 4; i++) {
            ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
            ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
            ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
        }

        // execute OpenCL kernels as tasks
        for (i = 0; i < 4; i++) {
            ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
        }

        // copy result back to the host
        ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);

        // end of execution
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf("\n%f seconds\n", duration);

        // display result
        printf("\nC result: \n");
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                printf("%.2f\t", C[i*4+j]);
            }
            printf("\n");
        }
        printf("\n");

        // free
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel[0]);
        ret = clReleaseKernel(kernel[1]);
        ret = clReleaseKernel(kernel[2]);
        ret = clReleaseKernel(kernel[3]);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(Amobj);
        ret = clReleaseMemObject(Bmobj);
        ret = clReleaseMemObject(Cmobj);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);

        free(source_str);
        free(A);
        free(B);
        free(C);

        system("pause");
        return 0;
    }
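
One caveat with the out-of-order queue above: the blocking clEnqueueReadBuffer is not automatically ordered after the four clEnqueueTask calls. A more defensive variant (my own sketch, not part of the book's example) attaches an event to each task and makes the read wait on all of them:

    cl_event task_done[4];
    for (i = 0; i < 4; i++) {
        ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, &task_done[i]);
    }
    /* The read may only start once all four tasks have completed. */
    ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float),
                              C, 4, task_done, NULL);
    for (i = 0; i < 4; i++) {
        ret = clReleaseEvent(task_done[i]);
    }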

kernel.cl:

    __kernel void add_parallel(__global float *A, __global float *B, __global float *C)
    {
        int base = 0; // column 0: addition

        for (int i = 0; i < 4; i++)
        {
            C[base+i*4] = A[base+i*4] + B[base+i*4];
        }
        // Equivalent unrolled form:
        // C[base+0]  = A[base+0]  + B[base+0];
        // C[base+4]  = A[base+4]  + B[base+4];
        // C[base+8]  = A[base+8]  + B[base+8];
        // C[base+12] = A[base+12] + B[base+12];
    }

    __kernel void sub_parallel(__global float *A, __global float *B, __global float *C)
    {
        int base = 1; // column 1: subtraction

        for (int i = 0; i < 4; i++)
        {
            C[base+i*4] = A[base+i*4] - B[base+i*4];
        }
    }

    __kernel void mul_parallel(__global float *A, __global float *B, __global float *C)
    {
        int base = 2; // column 2: multiplication

        for (int i = 0; i < 4; i++)
        {
            C[base+i*4] = A[base+i*4] * B[base+i*4];
        }
    }

    __kernel void div_parallel(__global float *A, __global float *B, __global float *C)
    {
        int base = 3; // column 3: division

        for (int i = 0; i < 4; i++)
        {
            C[base+i*4] = A[base+i*4] / B[base+i*4];
        }
    }
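
Each of these four kernels touches only one column of the 4x4 matrices: base selects the column (0 for addition, 1 for subtraction, 2 for multiplication, 3 for division) and the loop walks down the four rows, matching the colored columns of Figure 4.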

3. References

The example and programs are taken from The OpenCL Programming Book. In fact, the example can be parallelized even further: given enough available parallelism, all 16 element computations can run at once, i.e. the four time-sequenced computations inside each of the add, subtract, multiply, and divide tasks can also execute simultaneously, as sketched below.
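
As a rough sketch of that idea (not from the book; the kernel name and the modulo-based operation selection are my own illustration), a single kernel launched with a global work size of 16 could let every element pick its operation from its column index:

    __kernel void allParallel(__global float *A, __global float *B, __global float *C)
    {
        int gid = get_global_id(0); // 0..15, one work-item per element
        int op  = gid % 4;          // column index selects the operation

        if      (op == 0) C[gid] = A[gid] + B[gid];
        else if (op == 1) C[gid] = A[gid] - B[gid];
        else if (op == 2) C[gid] = A[gid] * B[gid];
        else              C[gid] = A[gid] / B[gid];
    }

On the host side this would be enqueued with clEnqueueNDRangeKernel and global_item_size = 16.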
