High-Performance Computing: CUDA Single-Stream vs. Multi-Stream Scheduling (24)

1. Introduction

(1) Use CUDA to compute pow(sin(id), 2) + pow(cos(id), 2) for every element index id (a quick correctness check is sketched after this list).
(2) Compare the efficiency of a single stream (synchronous transfer vs. asynchronous transfer), multi-stream depth-first scheduling, and multi-stream breadth-first scheduling; the timing covers both data transfer and computation.
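Since sin²(x) + cos²(x) = 1 for any x, every element written by the kernel should come out as roughly 1.0, which gives a cheap host-side correctness check. A minimal sketch of such a check (the helper name and tolerance are assumptions, not part of the original code):

#include <math.h>

//Hypothetical helper: returns 1 if every element of the result array is ~1.0f
static int verify(const float* out, int n)
{
    for (int i = 0; i < n; ++i)
        if (fabsf(out[i] - 1.0f) > 1e-5f)
            return 0;   //found an element that deviates from 1
    return 1;           //all elements equal 1 within tolerance
}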

2. Core code

/*
1. Use CUDA to compute pow(sin(id), 2) + pow(cos(id), 2)
2. Compare single-stream (synchronous transfer, asynchronous transfer), multi-stream depth-first and multi-stream breadth-first scheduling; timing includes data transfer and computation
3. Use an error-checking macro around CUDA API calls
*/

#include <stdio.h>
#include <stdlib.h>         //calloc, free, atoi, exit
#include <cuda_runtime.h>   //CUDA runtime API (streams, events, memory)

#define CUDA_ERROR_CHECK    //toggle CUDA API error checking

#define BLOCKSIZE 256
int N = 1<<28;              //number of elements
int NBytes = N*sizeof(float); //size of the data in bytes


//Macro that checks whether a CUDA API call returned an error
#define CudaSafecCall(err) __cudaSafeCall(err,__FILE__,__LINE__)
inline void __cudaSafeCall(cudaError_t err,const char* file,const int line)
{
    #ifdef CUDA_ERROR_CHECK
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaSafeCall failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
        exit(-1);
    }
    #endif
}

//Macro that checks for execution errors in the stream, mainly after kernel launches
#define CudaCheckError() _cudaCheckError(__FILE__,__LINE__)
inline void _cudaCheckError(const char * file,const int line)
{
    #ifdef CUDA_ERROR_CHECK
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess)
    {
        fprintf(stderr,"cudaCheckError failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
        exit(-1);
    }
    #endif
}

__global__ void kernel_func(float * arr,int offset,const int n)
{
    //global index of the element handled by this thread; n is the exclusive upper bound
    int id = offset + threadIdx.x + blockIdx.x * blockDim.x;
    if(id<n)
        arr[id] = powf(sinf((float)id),2.0f) + powf(cosf((float)id),2.0f);   //always ~1.0f
}

//Single stream, pageable host memory, synchronous transfers
float gpu_base()
{
    //Allocate pageable host memory and device memory
    float* hostA,*deviceA;
    hostA = (float*)calloc(N,sizeof(float));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));

    
    float gpuTime = 0.0;
    cudaEvent_t start,end;
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));
    
    CudaSafecCall(cudaMemcpy(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
    kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
    CudaCheckError();

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    //Copy the results back to the host (issued after the timed region)
    CudaSafecCall(cudaMemcpy(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));

    printf("gpu_base: single stream, pageable memory, transfer + compute took %f ms\n",gpuTime);
    CudaSafecCall(cudaFree(deviceA));
    free(hostA);
    return gpuTime;
}

//Single stream, pinned (page-locked) host memory, asynchronous transfers
float gpu_base_pinMem()
{
    //Allocate pinned host memory and device memory
    float* hostA,*deviceA;
    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
    
    float gpuTime = 0.0;
    cudaEvent_t start,end;
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));
    
    //Asynchronous H2D copy from pinned memory on the default stream; it returns immediately
    //on the host but is still ordered before the kernel launch within the stream
    CudaSafecCall(cudaMemcpyAsync(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
    kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
    CudaCheckError();

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    //Copy the results back to the host (issued after the timed region) and make sure the
    //asynchronous copy has finished before the pinned buffer is freed below
    CudaSafecCall(cudaMemcpyAsync(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));
    CudaSafecCall(cudaDeviceSynchronize());

    printf("gpu_base_pinMem: single stream, pinned memory, transfer + compute took %f ms\n",gpuTime);

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    return gpuTime;
}

//Multi-stream, depth-first scheduling
float gpu_MStream_deep(int nStreams)
{
    //Allocate pinned (page-locked) host memory and device memory
    float* hostA,*deviceA;
    //Asynchronous transfers require page-locked host memory
    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
    
    float gpuTime = 0.0;
    cudaEvent_t start,end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamCreate(streams+i));
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));
    
    //Depth-first issue order: issue the H2D copy, kernel and D2H copy of one stream before moving
    //on to the next; across streams, at most one transfer and one kernel run concurrently
    //Number of elements per stream (assumes N is divisible by nStreams, true for the power-of-two counts tested)
    int nByStream = N/nStreams;
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
        kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
        CudaCheckError();
        CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
    }

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamSynchronize(streams[i]));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_MStream_deep %d个流深度优先调度,数据传输和计算耗时 %f ms\n",nStreams,gpuTime);

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamDestroy(streams[i]));

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}

//Multi-stream, breadth-first scheduling
float gpu_MStream_wide(int nStreams)
{
    //Allocate pinned (page-locked) host memory and device memory
    float* hostA,*deviceA;
    //Asynchronous transfers require page-locked host memory
    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
    
    float gpuTime = 0.0;
    cudaEvent_t start,end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamCreate(streams+i));
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));
    
    //Breadth-first issue order: issue all H2D copies first, then all kernels, then all D2H copies,
    //so copies and kernels from different streams can run in parallel
    //Number of elements per stream (assumes N is divisible by nStreams)
    int nByStream = N/nStreams;
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
    }
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
        CudaCheckError();
    }
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
    }

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamSynchronize(streams[i]));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_MStream_wide %d个流广度优先调度,数据传输和计算耗时 %f ms\n",nStreams,gpuTime);

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamDestroy(streams[i]));

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}

int main(int argc,char* argv[])
{
    //Number of streams, optional first command-line argument (default: 4)
    int nStreams = argc==2? atoi(argv[1]):4;

    //GPU default single stream, pageable host memory, synchronous transfers
    float gpuTime1 = gpu_base();

    //GPU default single stream, pinned host memory, asynchronous transfers
    float gpuTime2 = gpu_base_pinMem();

    //GPU multi-stream depth-first scheduling, asynchronous transfers
    float gpuTime3 = gpu_MStream_deep(nStreams);

    //GPU multi-stream breadth-first scheduling, asynchronous transfers
    float gpuTime4 = gpu_MStream_wide(nStreams);

    printf("Speedup of single-stream asynchronous transfer + compute over the default synchronous single stream: %f\n",gpuTime1/gpuTime2);
    printf("Speedup of %d-stream depth-first scheduling over the default synchronous single stream: %f\n",nStreams,gpuTime1/gpuTime3);
    printf("Speedup of %d-stream breadth-first scheduling over the default synchronous single stream: %f\n",nStreams,gpuTime1/gpuTime4);
    return 0;
}
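Assuming the listing is saved as stream_test.cu (the file name is an assumption, it is not given above), it can be built and run roughly as follows; the single optional argument is the number of streams:

nvcc -O2 -o stream_test stream_test.cu
./stream_test 8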

3. Test results

Time of each test and speedup relative to the single-stream synchronous transfer baseline:

| Test \ number of streams | 1 | 4 | 8 | 16 | 32 | 64 |
| --- | --- | --- | --- | --- | --- | --- |
| Single stream, synchronous transfer (time, ms) | 306.7 | - | - | - | - | - |
| Single stream, asynchronous transfer (time ms / speedup) | 199.4/1.53 | - | - | - | - | - |
| Multi-stream, depth-first scheduling (time ms / speedup) | - | 151.04/2.06 | 129.95/2.29 | 131.49/2.32 | 123.08/2.49 | 126.48/2.45 |
| Multi-stream, breadth-first scheduling (time ms / speedup) | - | 149.29/2.09 | 129.6/2.3 | 134.55/2.27 | 122.82/2.49 | 126.42/2.45 |

4. Analysis of the results

(1) Single-stream asynchronous transfer is clearly more efficient than synchronous transfer. The PCIe DMA engine can only access pinned (page-locked) host memory; the synchronous version copies from pageable (virtual, non-locked) host addresses, so every transfer pays the extra cost of staging the data from pageable into pinned memory, a cost the asynchronous, pinned-memory version does not have.
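The difference can be seen directly by timing the same host-to-device copy from pageable and from pinned memory. A minimal sketch, assuming a 256 MB buffer and the default stream (error checking omitted for brevity):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main()
{
    const size_t bytes = 256u << 20;                 //256 MB test buffer
    float *pageable = (float*)calloc(bytes, 1);      //ordinary pageable host memory
    float *pinned, *dev;
    cudaMallocHost((void**)&pinned, bytes);          //page-locked host memory
    cudaMalloc((void**)&dev, bytes);

    cudaEvent_t s, e;
    float msPageable = 0.0f, msPinned = 0.0f;
    cudaEventCreate(&s); cudaEventCreate(&e);

    cudaEventRecord(s);
    cudaMemcpy(dev, pageable, bytes, cudaMemcpyHostToDevice);   //staged through an internal pinned buffer
    cudaEventRecord(e); cudaEventSynchronize(e);
    cudaEventElapsedTime(&msPageable, s, e);

    cudaEventRecord(s);
    cudaMemcpy(dev, pinned, bytes, cudaMemcpyHostToDevice);     //direct DMA from page-locked memory
    cudaEventRecord(e); cudaEventSynchronize(e);
    cudaEventElapsedTime(&msPinned, s, e);

    printf("H2D 256 MB: pageable %.2f ms, pinned %.2f ms\n", msPageable, msPinned);

    cudaEventDestroy(s); cudaEventDestroy(e);
    cudaFree(dev); cudaFreeHost(pinned); free(pageable);
    return 0;
}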

(2) The multi-stream versions are roughly 1.5x faster than the single-stream ones, because kernels in one stream overlap with transfers in other streams.
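Whether this overlap is possible at all depends on the device: asyncEngineCount in cudaDeviceProp reports how many copy (DMA) engines can run concurrently with kernels. A minimal query sketch, assuming device 0:

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);                        //query device 0
    printf("%s: deviceOverlap=%d, asyncEngineCount=%d\n",     //1+ copy engines => copy/compute overlap
           prop.name, prop.deviceOverlap, prop.asyncEngineCount);
    return 0;
}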

(3) In both multi-stream tests the speedup clearly improves as the number of streams grows, rising from 2.06 to around 2.4.

(4) At the 2^28 data size, breadth-first scheduling performs almost identically to depth-first scheduling, probably because the data set is so large that hardware resources are saturated and the issue order cannot expose any extra multi-stream concurrency. In repeated tests with a data size of 2^20 and 2 to 8 streams, breadth-first gains roughly 2% in speedup; as the number of streams grows further, breadth-first actually falls behind depth-first.
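To reproduce the 2^20 comparison without editing the source, the data size could also be taken from the command line. A minimal sketch of such a change to the start of main() (hypothetical, not part of the original program; the second argument is log2 of the element count):

//Hypothetical extension: ./stream_test <nStreams> <log2N>, e.g. ./stream_test 8 20
int nStreams = argc > 1 ? atoi(argv[1]) : 4;   //first argument: number of streams (default 4)
if (argc > 2)
{
    N      = 1 << atoi(argv[2]);               //second argument: log2 of the element count, e.g. 20 for 2^20
    NBytes = N * sizeof(float);                //keep the byte count consistent with the new N
}
//...then run the four tests exactly as in main() above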
