/*
1. Use CUDA to compute pow(sin(id),2) + pow(cos(id),2).
2. Compare the efficiency of a single stream (synchronous transfer, asynchronous transfer), multi-stream depth-first scheduling, and multi-stream breadth-first scheduling (timing includes data transfer and compute).
3. Use API error-checking macros.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define CUDA_ERROR_CHECK

#define BLOCKSIZE 256
int N = 1<<28;
size_t NBytes = N*sizeof(float);

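/*
Error-checking helpers:
- CudaSafecCall(err) wraps a CUDA runtime API call and aborts with file/line
  information when the call does not return cudaSuccess.
- CudaCheckError() is placed right after a kernel launch; a launch returns no
  status directly, so the last error is polled with cudaGetLastError().
The checks are compiled out when CUDA_ERROR_CHECK is not defined.
*/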
#define CudaSafecCall(err) __cudaSafeCall(err,__FILE__,__LINE__)
inline void __cudaSafeCall(cudaError_t err,const char* file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    if(err!=cudaSuccess)
    {
        fprintf(stderr,"cudaSafeCall failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
        exit(-1);
    }
#endif
}


#define CudaCheckError() _cudaCheckError(__FILE__,__LINE__)
inline void _cudaCheckError(const char * file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess)
    {
        fprintf(stderr,"cudaCheckError failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
        exit(-1);
    }
#endif
}

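/*
Each thread evaluates sinf(id)^2 + cosf(id)^2 at its global index, so every element
should come out as roughly 1.0. The offset parameter lets the multi-stream versions
launch over a sub-range of the array; n is the exclusive upper bound of that range.
*/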
__global__ void kernel_func(float * arr,int offset,const int n)
{
    int id = offset + threadIdx.x + blockIdx.x * blockDim.x;
    if(id<n)
        arr[id] = powf(sinf(id),2.0f) + powf(cosf(id),2.0f);
}

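/*
Optional host-side sanity check (an addition, not called from main): because
sin^2(x) + cos^2(x) == 1, every element copied back to the host should be close
to 1.0f. A minimal sketch:
*/
void check_result(const float* hostA, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (fabsf(hostA[i] - 1.0f) > 1e-4f)
        {
            printf("mismatch at index %d: %f\n", i, hostA[i]);
            return;
        }
    }
    printf("all %d elements are ~1.0\n", n);
}
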
float gpu_base()
{
    float* hostA,*deviceA;
    hostA = (float*)calloc(N,sizeof(float));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));

    float gpuTime = 0.0;
    cudaEvent_t start,end;
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));

    CudaSafecCall(cudaMemcpy(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
    kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
    CudaCheckError();
    // copy the result back inside the timed region so the measurement covers transfers in both directions
    CudaSafecCall(cudaMemcpy(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_base: single stream, pageable host memory, data transfer + compute took %f ms\n",gpuTime);
    CudaSafecCall(cudaFree(deviceA));
    free(hostA);
    return gpuTime;
}

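/*
Pinned (page-locked) host memory: cudaMemcpyAsync only overlaps with kernel execution
and returns to the host immediately when the host buffer is page-locked (allocated
with cudaMallocHost). With pageable memory the transfer is staged through a pinned
buffer and behaves largely synchronously.
*/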
float gpu_base_pinMem()
{
    float* hostA,*deviceA;
    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));

    float gpuTime = 0.0;
    cudaEvent_t start,end;
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));

    CudaSafecCall(cudaMemcpyAsync(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
    kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
    CudaCheckError();
    // copy the result back inside the timed region so the measurement covers transfers in both directions
    CudaSafecCall(cudaMemcpyAsync(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));   // waits for the async copies and the kernel on the default stream
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_base_pinMem: single stream, pinned host memory, data transfer + compute took %f ms\n",gpuTime);

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    return gpuTime;
}

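/*
Depth-first scheduling: for each stream in turn, the full chain
H2D copy -> kernel -> D2H copy is issued before moving on to the next stream.
Work from different streams can then overlap, depending on how many copy engines
and hardware work queues the device provides.
*/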
float gpu_MStream_deep(int nStreams)
{
    float* hostA,*deviceA;

    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));

    float gpuTime = 0.0;
    cudaEvent_t start,end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamCreate(streams+i));
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));

    int nByStream = N/nStreams;    // assumes N is divisible by nStreams
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
        kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
        CudaCheckError();
        CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
    }

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamSynchronize(streams[i]));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_MStream_deep: %d streams, depth-first scheduling, data transfer + compute took %f ms\n",nStreams,gpuTime);

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamDestroy(streams[i]));

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}

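/*
Breadth-first scheduling: all H2D copies are issued first, then all kernels, then
all D2H copies. On older GPUs with a single hardware work queue this ordering avoided
false dependencies between streams; on devices with Hyper-Q the two orderings
usually perform about the same.
*/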
float gpu_MStream_wide(int nStreams)
{
    float* hostA,*deviceA;

    CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
    CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));

    float gpuTime = 0.0;
    cudaEvent_t start,end;
    cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamCreate(streams+i));
    CudaSafecCall(cudaEventCreate(&start));
    CudaSafecCall(cudaEventCreate(&end));
    CudaSafecCall(cudaEventRecord(start));

    int nByStream = N/nStreams;    // assumes N is divisible by nStreams
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
    }
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
        CudaCheckError();
    }
    for(int i=0;i<nStreams;i++)
    {
        int offset = i * nByStream;
        CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
    }

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamSynchronize(streams[i]));

    CudaSafecCall(cudaEventRecord(end));
    CudaSafecCall(cudaEventSynchronize(end));
    CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
    CudaSafecCall(cudaEventDestroy(start));
    CudaSafecCall(cudaEventDestroy(end));

    printf("gpu_MStream_wide: %d streams, breadth-first scheduling, data transfer + compute took %f ms\n",nStreams,gpuTime);

    for(int i=0;i<nStreams;i++)
        CudaSafecCall(cudaStreamDestroy(streams[i]));

    CudaSafecCall(cudaFreeHost(hostA));
    CudaSafecCall(cudaFree(deviceA));
    free(streams);
    return gpuTime;
}

int main(int argc,char* argv[])
{
    int nStreams = argc==2? atoi(argv[1]):4;

    float gpuTime1 = gpu_base();

    float gpuTime2 = gpu_base_pinMem();

    float gpuTime3 = gpu_MStream_deep(nStreams);

    float gpuTime4 = gpu_MStream_wide(nStreams);

    printf("Compared with the default synchronous single stream, the speedup of single-stream asynchronous transfer + compute is %f\n",gpuTime1/gpuTime2);
    printf("Compared with the default synchronous single stream, the speedup of %d-stream depth-first asynchronous transfer + compute is %f\n",nStreams,gpuTime1/gpuTime3);
    printf("Compared with the default synchronous single stream, the speedup of %d-stream breadth-first asynchronous transfer + compute is %f\n",nStreams,gpuTime1/gpuTime4);
    return 0;
}
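
/*
Build and run (a sketch; the file name and architecture flag are assumptions,
adjust them to your GPU):
    nvcc -O2 -arch=sm_70 stream_compare.cu -o stream_compare
    ./stream_compare 4      # optional argument: number of streams (default 4)
*/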