高性能计算-GPU单进程多卡(多流)并行计算编程模型示例(25)
1. 简介
(1) 使用CPU对两个向量进行逐元素相乘(即文中所说的"点乘",结果仍是一个向量,而不是内积)的串行计算
(2) 对数据进行分块,使用单进程多卡(多流)并行计算
(3) 使用不同数据规模,比较加速比的变化
2. 代码
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

// When defined, CudaCall/CudaCheck abort the program on any CUDA error.
#define CUDA_ERROR_CHECK

int nGpus = 1;                         // number of GPUs used for the run
int blockSize = 256;                   // threads per block
int leftBit = 10;                      // log2 of the vector length
unsigned long nSize = 1UL << leftBit;  // number of elements in each vector
float *hostA = NULL;                   // input vector A (host)
float *hostB = NULL;                   // input vector B (host)
float *hostResult = NULL;              // result of the serial CPU computation
float *deviceResult = NULL;            // host buffer receiving the GPU result

// Wraps a CUDA runtime API call and aborts with file/line info on failure.
#define CudaCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError_t err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaSafeCall failed at %s:%d :(%d) %s\n", file, line, err, cudaGetErrorString(err));
        exit(-1);
    }
#endif
}

// Checks (and clears) the last CUDA error; intended for use right after a
// kernel launch, which does not return an error code itself.
#define CudaCheck() _cudaCheckError(__FILE__, __LINE__)
inline void _cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaCheckError failed at %s:%d :(%d) %s\n", file, line, err, cudaGetErrorString(err));
        exit(-1);
    }
#endif
}

// Returns the current wall-clock time in milliseconds.
long getTime()
{
    struct timeval cur;
    gettimeofday(&cur, NULL);
    return (long)cur.tv_sec * 1000L + (long)cur.tv_usec / 1000L;
}

// Fills A and B (each `len` elements) with pseudo-random values in [0, 1].
// The seed is fixed so every run works on the same data.
void initData(float *A, float *B, unsigned long len)
{
    srand(0);
    for (unsigned long i = 0; i < len; i++)
    {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
}

// Serial CPU reference: hostResult[i] = hostA[i] * hostB[i] for i < len.
// Returns the elapsed wall-clock time in milliseconds.
long serial(unsigned long len)
{
    long start = getTime();
    for (unsigned long i = 0; i < len; i++)
        hostResult[i] = hostA[i] * hostB[i];
    long end = getTime();
    return end - start;
}

// Element-wise product: result[i] = A[i] * B[i] for i < len.
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total thread count, so any grid size covers all `len`
// elements. Uses no shared memory. Indices are 64-bit so that
// blockIdx.x * blockDim.x cannot wrap.
__global__ void kernel(const float *A, const float *B, float *result, unsigned long len)
{
    unsigned long stride = (unsigned long)gridDim.x * blockDim.x;
    for (unsigned long id = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x; id < len; id += stride)
        result[id] = A[id] * B[id];
}

// calloc wrapper that aborts on allocation failure.
static void *hostCalloc(size_t count, size_t size)
{
    void *p = calloc(count, size);
    if (p == NULL)
    {
        fprintf(stderr, "host allocation of %lu x %lu bytes failed\n", (unsigned long)count, (unsigned long)size);
        exit(-1);
    }
    return p;
}

// Number of elements GPU `i` processes: len/ngpus each, with the last GPU
// also taking the remainder so that no element is dropped.
static unsigned long chunkLength(int i, unsigned long len, int ngpus)
{
    unsigned long base = len / ngpus;
    return (i == ngpus - 1) ? len - base * (unsigned long)(ngpus - 1) : base;
}

// Counts elements of `actual` that differ from `expected` beyond a small
// absolute + relative tolerance.
static unsigned long countMismatches(const float *expected, const float *actual, unsigned long len)
{
    unsigned long bad = 0;
    for (unsigned long i = 0; i < len; i++)
    {
        float tol = 1e-6f + 1e-5f * fabsf(expected[i]);
        if (fabsf(expected[i] - actual[i]) > tol)
            bad++;
    }
    return bad;
}

// Multi-GPU element-wise product of the global hostA and hostB.
//
// The input is split into `ngpus` contiguous chunks. GPU i (device ordinal i)
// gets its own device buffers and its own stream, copies its chunk in, runs
// the kernel and copies the chunk of the result back into `result`.
//
// Parameters:
//   result - host buffer of at least `len` floats receiving the product
//   len    - number of elements in hostA / hostB
//   ngpus  - number of devices to use (devices 0 .. ngpus-1)
// Returns the elapsed time in milliseconds for transfers plus compute across
// all GPUs (device allocation and cleanup are not included). Returns 0 when
// `len` is 0 or `ngpus` < 1.
float gpu_multi(float *result, unsigned long len, int ngpus)
{
    float gpuTime = 0.0f;
    if (ngpus < 1 || len == 0)
        return gpuTime;

    const unsigned long nPerGpu = len / ngpus;  // offset step between chunks

    // Host-side tables holding one device pointer / stream / event per GPU.
    float **deviceA = (float **)hostCalloc(ngpus, sizeof(float *));
    float **deviceB = (float **)hostCalloc(ngpus, sizeof(float *));
    float **deviceResult = (float **)hostCalloc(ngpus, sizeof(float *));
    unsigned long *counts = (unsigned long *)hostCalloc(ngpus, sizeof(unsigned long));
    cudaStream_t *streams = (cudaStream_t *)hostCalloc(ngpus, sizeof(cudaStream_t));
    cudaEvent_t *done = (cudaEvent_t *)hostCalloc(ngpus, sizeof(cudaEvent_t));

    // Allocate device buffers and create the per-GPU stream and "done" event.
    for (int i = 0; i < ngpus; i++)
    {
        counts[i] = chunkLength(i, len, ngpus);
        CudaCall(cudaSetDevice(i));
        if (counts[i] > 0)
        {
            CudaCall(cudaMalloc((void **)&deviceA[i], counts[i] * sizeof(float)));
            CudaCall(cudaMalloc((void **)&deviceB[i], counts[i] * sizeof(float)));
            CudaCall(cudaMalloc((void **)&deviceResult[i], counts[i] * sizeof(float)));
        }
        CudaCall(cudaStreamCreate(streams + i));
        CudaCall(cudaEventCreateWithFlags(done + i, cudaEventDisableTiming));
    }

    // Both timing events live on device 0 because cudaEventElapsedTime needs
    // two events from the same device.
    cudaEvent_t start, end;
    CudaCall(cudaSetDevice(0));
    CudaCall(cudaEventCreate(&start));
    CudaCall(cudaEventCreate(&end));
    CudaCall(cudaEventRecord(start, streams[0]));

    for (int i = 0; i < ngpus; i++)
    {
        CudaCall(cudaSetDevice(i));
        if (counts[i] > 0)
        {
            const unsigned long offset = (unsigned long)i * nPerGpu;
            const size_t bytes = counts[i] * sizeof(float);

            // Asynchronous host-to-device copies of this GPU's chunk.
            CudaCall(cudaMemcpyAsync(deviceA[i], hostA + offset, bytes, cudaMemcpyHostToDevice, streams[i]));
            CudaCall(cudaMemcpyAsync(deviceB[i], hostB + offset, bytes, cudaMemcpyHostToDevice, streams[i]));

            // Ceil-div launch size, capped at the 1D grid limit; the kernel's
            // stride loop covers whatever the cap leaves over.
            unsigned long numBlocks = (counts[i] + blockSize - 1) / blockSize;
            if (numBlocks > 2147483647UL)
                numBlocks = 2147483647UL;
            kernel<<<(unsigned int)numBlocks, blockSize, 0, streams[i]>>>(deviceA[i], deviceB[i], deviceResult[i], counts[i]);
            CudaCheck();

            // Asynchronous device-to-host copy of this GPU's part of the result.
            CudaCall(cudaMemcpyAsync(result + offset, deviceResult[i], bytes, cudaMemcpyDeviceToHost, streams[i]));
        }
        // Marks the point at which this GPU's work has finished.
        CudaCall(cudaEventRecord(done[i], streams[i]));
    }

    // Make stream 0 wait for every other GPU before recording `end`, so the
    // measured interval covers the slowest device, not just device 0.
    CudaCall(cudaSetDevice(0));
    for (int i = 1; i < ngpus; i++)
        CudaCall(cudaStreamWaitEvent(streams[0], done[i], 0));
    CudaCall(cudaEventRecord(end, streams[0]));

    // Wait for all streams to drain.
    for (int i = 0; i < ngpus; i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaStreamSynchronize(streams[i]));
    }

    CudaCall(cudaSetDevice(0));
    CudaCall(cudaEventSynchronize(end));
    CudaCall(cudaEventElapsedTime(&gpuTime, start, end));

    // Release device resources.
    CudaCall(cudaEventDestroy(start));
    CudaCall(cudaEventDestroy(end));
    for (int i = 0; i < ngpus; i++)
    {
        CudaCall(cudaSetDevice(i));
        if (counts[i] > 0)
        {
            CudaCall(cudaFree(deviceA[i]));
            CudaCall(cudaFree(deviceB[i]));
            CudaCall(cudaFree(deviceResult[i]));
        }
        CudaCall(cudaEventDestroy(done[i]));
        CudaCall(cudaStreamDestroy(streams[i]));
    }

    // The pointer tables themselves are host memory from calloc, so they are
    // released with free(), not cudaFree().
    free(deviceA);
    free(deviceB);
    free(deviceResult);
    free(counts);
    free(streams);
    free(done);

    return gpuTime;
}

// Usage: <program> <ngpus> <log2 of vector length>
// Runs the serial CPU computation and the multi-GPU computation on the same
// data, checks the GPU result against the CPU one, and prints both timings
// and the speedup.
int main(int argc, char *argv[])
{
    if (argc != 3)
    {
        printf("parameter 1:ngpus 2:matrix dim 2^(_)\n");
        exit(-1);
    }

    const int requested = atoi(argv[1]);
    leftBit = atoi(argv[2]);
    // Keep the shift defined and nSize * sizeof(float) * 3 within 64 bits.
    if (leftBit < 0 || leftBit >= 60)
    {
        fprintf(stderr, "vector length exponent must be in [0, 59], got %d\n", leftBit);
        exit(-1);
    }
    nSize = 1UL << leftBit;

    int available = 0;
    CudaCall(cudaGetDeviceCount(&available));
    if (available < 1)
    {
        fprintf(stderr, "no CUDA device found\n");
        exit(-1);
    }
    // Use the requested count when it is valid; otherwise use every device.
    nGpus = (requested >= 1 && requested <= available) ? requested : available;

    // Smallest total global memory among the devices that will be used.
    size_t minGlobalMem = 0;
    for (int i = 0; i < nGpus; i++)
    {
        cudaDeviceProp prop;
        CudaCall(cudaGetDeviceProperties(&prop, i));
        if (i == 0 || prop.totalGlobalMem < minGlobalMem)
            minGlobalMem = prop.totalGlobalMem;
    }
    const int globalMemSize = (int)(minGlobalMem / 1024 / 1024);

    // Device memory needed by the largest chunk: three float arrays.
    const unsigned long largestChunk = chunkLength(nGpus - 1, nSize, nGpus);
    const unsigned long needBytes = largestChunk * sizeof(float) * 3;
    if (needBytes > minGlobalMem)
    {
        printf("one gpu memory not enough: need %luMB, greater than %dMB\n", needBytes / 1024 / 1024, globalMemSize);
        exit(-1);
    }
    const int memSize = (int)(needBytes / 1024 / 1024);  // per-GPU need in MB

    const unsigned long nBytes = nSize * sizeof(float);  // bytes per vector

    // Page-locked host buffers, required for the asynchronous copies.
    CudaCall(cudaMallocHost((void **)&hostA, nBytes));
    CudaCall(cudaMallocHost((void **)&hostB, nBytes));
    CudaCall(cudaMallocHost((void **)&hostResult, nBytes));
    CudaCall(cudaMallocHost((void **)&deviceResult, nBytes));

    initData(hostA, hostB, nSize);

    // Serial CPU computation.
    long cpuTime = serial(nSize);

    // Multi-GPU computation.
    float gpuTime = gpu_multi(deviceResult, nSize, nGpus);

    // Compare the GPU result with the CPU reference.
    const unsigned long mismatches = countMismatches(hostResult, deviceResult, nSize);
    if (mismatches != 0)
        fprintf(stderr, "result check failed: %lu of %lu elements differ from the CPU reference\n", mismatches, nSize);

    printf("单个向量长度 2^%d,单个显卡三个数组需要显存 %dMB,使用 %d个GPU,cpu串行耗时 %ldms,GPU并行数据传输和计算耗时 %fms,加速比: %f\n",
           leftBit, memSize, nGpus, cpuTime, gpuTime, cpuTime / gpuTime);

    CudaCall(cudaFreeHost(hostA));
    CudaCall(cudaFreeHost(hostB));
    CudaCall(cudaFreeHost(hostResult));
    CudaCall(cudaFreeHost(deviceResult));
    return mismatches == 0 ? 0 : 1;
}
3. 测试脚本
#!/bin/bash
# Builds pointMul.cu and benchmarks it for several GPU counts and data sizes.
# Every run's output is appended to the log file "out".

log=out

# Compile; stop here if the build fails so that a stale or missing binary is
# never benchmarked.
nvcc pointMul.cu -o pointMul || { echo "nvcc build failed" >&2; exit 1; }

# Truncate the log file.
> "$log"
echo "start $(date)" >> "$log"

# Serial computation (disabled)
# for((i=0;i<4;i++)); do
#     yhrun -N1 -n1 -pTH_GPU ./matrix_add2D 0 | tee -a "$log"
# done

# Number of GPUs
nGpus=(1 2 3 4)
# Data size 2^(S)
S=(24 28 30 31)

# GPU runs: each configuration is repeated three times.
for n in "${nGpus[@]}"; do
    for s in "${S[@]}"; do
        for((i=0;i<3;i++)); do
            yhrun -N1 -n1 -pTH_GPU ./pointMul "$n" "$s" | tee -a "$log"
        done
    done
done

echo "end $(date)" >> "$log"
4. 测试数据
由于测试脚本的限制,CPU串行计算在GPU单卡(K80 12G显存)、双卡、四卡测试中分别跑了一轮,数据如下:
数据长度 | 单卡测试时的CPU串行耗时(ms) | 双卡测试时的CPU串行耗时(ms) | 四卡测试时的CPU串行耗时(ms) |
---|---|---|---|
2^24 | 44.3 | 44.7 | 44 |
2^28 | 719.7 | 717.3 | 703.3 |
2^30 | - | 2988.7 | 2951.7 |
2^31 | - | - | 5906.7 |
GPU测试耗时及加速比数据:
数据长度 | gpu单卡(ms) | gpu2个卡(ms) | gpu4个卡(ms) |
---|---|---|---|
2^24(耗时/加速比) | 27.9/1.6 | 17.7/2.5 | 17.1/2.6 |
2^28(耗时/加速比) | 399.8/1.8 | 273.6/2.6 | 273.9/2.6 |
2^30(耗时/加速比) | 显存不足 | 985/3.0 | 1056.3/2.8 |
2^31(耗时/加速比) | 显存不足 | 显存不足 | 1942.4/3.0 |
5. 结果分析
(1)GPU比CPU计算有明显的性能提升,根据数据规模,数据量越大提升越明显。
(2)GPU数量从单卡增加到双卡时,计算效率有明显提升;但从上表看,双卡到四卡的耗时基本持平(例如 2^28 时为 273.6ms 与 273.9ms),进一步的提升并不明显,多卡的主要收益在于可以处理单卡显存放不下的数据规模。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂