高性能计算-GPU单进程多卡(多流)并行计算编程模型示例(25)

1. 简介

(1) 使用CPU对两个向量做逐元素相乘(即本文所说的"点乘",result[i] = A[i] * B[i])的串行计算
(2) 对数据进行分块,使用单进程多卡(多流)并行计算
(3) 使用不同数据规模,比较加速比的变化

2. 代码

#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define CUDA_ERROR_CHECK
int nGpus = 1; // number of GPUs to use
int blockSize = 256; // threads per block used for the kernel launch
int leftBit = 10; // left-shift amount: the data size is 2^leftBit elements
unsigned long nSize = 1LL << leftBit; // number of elements in each vector
float *hostA = NULL; // input vector A (host memory)
float *hostB = NULL; // input vector B (host memory)
float *hostResult = NULL; // result of the serial CPU computation
float *deviceResult = NULL; // host buffer that receives the result computed on the GPUs
// CudaCall(expr): wraps a CUDA runtime API call and hands its status, together
// with the call site, to __cudaSafeCall.
#define CudaCall(err) __cudaSafeCall(err,__FILE__,__LINE__)
// Prints the failing location, numeric code and error string to stderr and
// terminates the process when err is not cudaSuccess.
// Compiles to a no-op when CUDA_ERROR_CHECK is not defined.
inline void __cudaSafeCall(cudaError_t err,const char* file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    // Success is the common case: leave immediately.
    if(cudaSuccess == err)
        return;
    fprintf(stderr,"cudaSafeCall failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
    exit(-1);
#endif
}
// CudaCheck(): queries the last CUDA runtime error at the call site; intended
// to be placed right after a kernel launch, which returns no status itself.
#define CudaCheck() _cudaCheckError(__FILE__,__LINE__)
// Reads (and thereby clears) the last runtime error via cudaGetLastError();
// on failure prints location, code and error string to stderr and terminates.
// Compiles to a no-op when CUDA_ERROR_CHECK is not defined.
inline void _cudaCheckError(const char * file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    const cudaError_t status = cudaGetLastError();
    if(cudaSuccess == status)
        return;
    fprintf(stderr,"cudaCheckError failed at %s:%d :(%d) %s\n",file,line,status,cudaGetErrorString(status));
    exit(-1);
#endif
}
// Returns the current wall-clock time from gettimeofday() in milliseconds,
// truncated to a whole number.
long getTime()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    // Seconds scaled up and microseconds scaled down to milliseconds.
    const double fromSeconds = now.tv_sec * 1e3;
    const double fromMicros = now.tv_usec / 1e3;
    return fromSeconds + fromMicros;
}
// Fills A[0..len) and B[0..len) with pseudo-random values rand()/RAND_MAX.
// The generator is re-seeded with 0 on every call, so repeated calls produce
// the same data.
void initData(float *A,float *B,unsigned long len)
{
    srand(0);
    unsigned long idx = 0;
    while(idx < len)
    {
        // Draw order is part of the contract: A's value first, then B's.
        const float forA = (float)rand()/RAND_MAX;
        const float forB = (float)rand()/RAND_MAX;
        A[idx] = forA;
        B[idx] = forB;
        ++idx;
    }
}
// Serial CPU reference: writes hostA[i] * hostB[i] into hostResult[i] for the
// first len elements and returns the elapsed time as the difference of two
// getTime() readings.
long serial(unsigned long len)
{
    const long begin = getTime();
    const float *lhs = hostA;
    const float *rhs = hostB;
    float *out = hostResult;
    for(unsigned long k = 0; k != len; ++k)
    {
        out[k] = lhs[k] * rhs[k];
    }
    const long finish = getTime();
    return finish - begin;
}
// Element-wise product: result[i] = A[i] * B[i] for every i in [0, len).
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid: each
// thread starts at its global index and advances by the total number of
// launched threads until it passes len. No shared memory is used.
//
// A, B and result must each point to at least len floats of device-accessible
// memory.
__global__ void kernel(float *A,float *B,float *result,unsigned long len)
{
    // Widen BEFORE multiplying: blockIdx.x * blockDim.x is a 32-bit product
    // and would wrap around once the launch covers more than 2^32 threads.
    const unsigned long long stride = (unsigned long long)gridDim.x * blockDim.x;
    unsigned long long id = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
    for(; id < len; id += stride)
        result[id] = A[id] * B[id];
}
// Multi-GPU element-wise product of the global host vectors hostA and hostB,
// using one stream per GPU.
//
// The len elements are split into ngpus contiguous chunks; when len is not a
// multiple of ngpus the first (len % ngpus) devices get one extra element, so
// no element is dropped. For each device the function enqueues, on that
// device's stream: copy A chunk, copy B chunk, kernel launch, copy result
// chunk back into result.
//
// Parameters:
//   result - host buffer of at least len floats receiving the products
//   len    - number of elements to process
//   ngpus  - number of devices to use (devices 0 .. ngpus-1)
// Returns:
//   Host wall-clock time in milliseconds from just before the first copy is
//   enqueued until every stream has been synchronized, i.e. transfer plus
//   compute time across ALL devices. Returns 0 when ngpus <= 0 or len == 0.
//   Device allocation, stream creation and cleanup are not included.
//
// Any CUDA failure terminates the process through CudaCall/CudaCheck.
// NOTE(review): the copies only overlap across devices if hostA, hostB and
// result are page-locked (main allocates them with cudaMallocHost) - confirm
// for any other caller.
float gpu_multi(float *result,unsigned long len,int ngpus)
{
    // Nothing to compute; also protects the divisions below.
    if(ngpus <= 0 || len == 0)
        return 0.0f;

    // Host-side bookkeeping, one slot per GPU.
    float **deviceA = (float**)calloc(ngpus,sizeof(float*));
    float **deviceB = (float**)calloc(ngpus,sizeof(float*));
    float **deviceOut = (float**)calloc(ngpus,sizeof(float*));
    cudaStream_t *streams = (cudaStream_t*)calloc(ngpus,sizeof(cudaStream_t));
    unsigned long *offsets = (unsigned long*)calloc(ngpus,sizeof(unsigned long));
    unsigned long *counts = (unsigned long*)calloc(ngpus,sizeof(unsigned long));
    if(!deviceA || !deviceB || !deviceOut || !streams || !offsets || !counts)
    {
        fprintf(stderr,"gpu_multi: host allocation failed\n");
        exit(-1);
    }

    // Partition [0, len) into contiguous chunks, spreading the remainder over
    // the first devices.
    const unsigned long base = len / ngpus;
    const unsigned long extra = len % ngpus;
    unsigned long next = 0;
    for(int i=0;i<ngpus;i++)
    {
        counts[i] = base + ((unsigned long)i < extra ? 1UL : 0UL);
        offsets[i] = next;
        next += counts[i];
    }

    // Create one stream per device and allocate its device buffers.
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaStreamCreate(streams+i));
        if(counts[i] == 0)
            continue; // more devices than elements: this device stays idle
        const size_t bytes = counts[i]*sizeof(float);
        CudaCall(cudaMalloc((void**)&deviceA[i],bytes));
        CudaCall(cudaMalloc((void**)&deviceB[i],bytes));
        CudaCall(cudaMalloc((void**)&deviceOut[i],bytes));
    }

    // Time with a host clock: an event pair recorded on one stream only
    // measures that single device, not the whole multi-GPU run.
    struct timeval begin,finish;
    gettimeofday(&begin,NULL);
    for(int i=0;i<ngpus;i++)
    {
        if(counts[i] == 0)
            continue;
        const size_t bytes = counts[i]*sizeof(float);
        CudaCall(cudaSetDevice(i));
        // Asynchronous host-to-device copies of this device's chunk.
        CudaCall(cudaMemcpyAsync(deviceA[i],hostA+offsets[i],bytes,cudaMemcpyHostToDevice,streams[i]));
        CudaCall(cudaMemcpyAsync(deviceB[i],hostB+offsets[i],bytes,cudaMemcpyHostToDevice,streams[i]));
        // One thread per element, rounded up to whole blocks.
        const unsigned int numBlocks = (unsigned int)((counts[i] + blockSize - 1)/blockSize);
        kernel<<<numBlocks,blockSize,0,streams[i]>>>(deviceA[i],deviceB[i],deviceOut[i],counts[i]);
        CudaCheck();
        // Asynchronous device-to-host copy of this device's results.
        CudaCall(cudaMemcpyAsync(result+offsets[i],deviceOut[i],bytes,cudaMemcpyDeviceToHost,streams[i]));
    }
    // Wait for every device to finish before stopping the clock.
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaStreamSynchronize(streams[i]));
    }
    gettimeofday(&finish,NULL);
    const double elapsedMs = (double)(finish.tv_sec - begin.tv_sec)*1e3
                           + (double)(finish.tv_usec - begin.tv_usec)/1e3;

    // Release device resources (cudaFree accepts the NULL left in idle slots).
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaFree(deviceA[i]));
        CudaCall(cudaFree(deviceB[i]));
        CudaCall(cudaFree(deviceOut[i]));
        CudaCall(cudaStreamDestroy(streams[i]));
    }
    // The bookkeeping arrays are host memory from calloc: release with free().
    free(deviceA);
    free(deviceB);
    free(deviceOut);
    free(streams);
    free(offsets);
    free(counts);
    return (float)elapsedMs;
}
// Benchmark driver.
//
// Usage: <program> <ngpus> <exponent>
//   ngpus    - requested number of GPUs; if it is < 1, larger than the number
//              of available devices, or does not divide the data size, the
//              number of available devices is used instead
//   exponent - each vector holds 2^exponent floats
//
// Runs the serial CPU product and the multi-GPU product on the same data,
// compares the two results, and prints sizes, timings and the speedup.
// Returns 0 on success, 1 if the GPU result differs from the CPU reference;
// exits with -1 on bad arguments, insufficient GPU memory or any CUDA error.
int main(int argc, char* argv[])
{
    // Upper bound for the exponent: keeps 1LL << leftBit and the byte-size
    // arithmetic below well inside 64 bits.
    const int maxLeftBit = 40;

    cudaDeviceProp prop;
    int globalMemSize = 0; // total global memory of device 0, in MB
    int memSize = 0;       // memory needed on each GPU for its three arrays, in MB
    CudaCall(cudaGetDeviceProperties(&prop ,0));
    globalMemSize = (int)(prop.totalGlobalMem/1024/1024);
    // Number of devices available.
    CudaCall(cudaGetDeviceCount(&nGpus));

    if(argc != 3)
    {
        printf("parameter 1:ngpus 2:matrix dim 2^(_)\n");
        exit(-1);
    }

    leftBit = atoi(argv[2]);
    if(leftBit < 0 || leftBit > maxLeftBit)
    {
        // A negative or oversized shift is undefined behaviour.
        fprintf(stderr,"exponent must be between 0 and %d\n",maxLeftBit);
        exit(-1);
    }
    nSize = 1LL << leftBit;

    // Fall back to all available devices when the request is unusable.
    // The n < 1 test must come first: it keeps nSize % n from dividing by zero.
    const int n = atoi(argv[1]);
    if(!(n < 1 || n > nGpus || nSize % n != 0))
        nGpus = n;
    // The fallback count itself may not divide the data size (e.g. 3 devices);
    // reduce it until every GPU gets an equally sized chunk. Stops at 1 at the latest.
    while(nGpus > 1 && nSize % nGpus != 0)
        nGpus--;

    memSize = nSize*sizeof(float)*3/nGpus/1024/1024;
    // Check the per-GPU requirement against device 0's total global memory.
    if(memSize > globalMemSize)
    {
        printf("one gpu memory not enough gater %dMB\n",globalMemSize);
        exit(-1);
    }

    unsigned long nBytes = nSize * sizeof(float); // bytes per vector
    // Page-locked host buffers, required for asynchronous copies.
    CudaCall(cudaMallocHost((void**)&hostA,nBytes));
    CudaCall(cudaMallocHost((void**)&hostB,nBytes));
    CudaCall(cudaMallocHost((void**)&hostResult,nBytes));
    CudaCall(cudaMallocHost((void**)&deviceResult,nBytes));
    initData(hostA,hostB,nSize);
    // Serial CPU reference.
    long cpuTime = serial(nSize);
    // Multi-GPU computation.
    float gpuTime = gpu_multi(deviceResult,nSize,nGpus);

    // Validate the GPU result against the CPU reference. The negated
    // comparison also counts NaNs as mismatches.
    unsigned long mismatches = 0;
    for(unsigned long i=0;i<nSize;i++)
    {
        float diff = hostResult[i] - deviceResult[i];
        if(diff < 0.0f)
            diff = -diff;
        if(!(diff <= 1e-6f))
            mismatches++;
    }
    if(mismatches != 0)
        fprintf(stderr,"result mismatch: %lu of %lu elements differ from the CPU reference\n",mismatches,nSize);

    // leftBit is an int, so it is printed with %d.
    printf("单个向量长度 2^%d,单个显卡三个数组需要显存 %dMB,使用 %d个GPU,cpu串行耗时 %ldms,GPU并行数据传输和计算耗时 %fms,加速比: %f\n",
           leftBit,memSize,nGpus,cpuTime,gpuTime,cpuTime/gpuTime);

    CudaCall(cudaFreeHost(hostA));
    CudaCall(cudaFreeHost(hostB));
    CudaCall(cudaFreeHost(hostResult));
    CudaCall(cudaFreeHost(deviceResult));
    return mismatches != 0 ? 1 : 0;
}

3. 测试脚本

#!/bin/bash
# Benchmark driver: builds pointMul and runs it for every combination of GPU
# count and data size, three times each, appending all output to a log file.

# Build; stop here on failure so a stale binary is never benchmarked.
nvcc pointMul.cu -o pointMul || exit 1
# Log file that collects the output of every run
log=out
# Truncate the log file
> "$log"
echo "start $(date)" >> "$log"
# Serial computation (disabled)
# for((i=0;i<4;i++)); do
# yhrun -N1 -n1 -pTH_GPU ./matrix_add2D 0 | tee -a "$log"
# done
# Number of GPUs to request
nGpus=(1 2 3 4)
# Data size exponents: each vector holds 2^S elements
S=(24 28 30 31)
# GPU runs
for n in "${nGpus[@]}"; do
    for s in "${S[@]}"; do
        for((i=0;i<3;i++)); do
            yhrun -N1 -n1 -pTH_GPU ./pointMul "$n" "$s" | tee -a "$log"
        done
    done
done
echo "end $(date)" >> "$log"

4. 测试数据

由于测试脚本的限制,CPU串行计算在GPU单卡(K80 12G显存)、双卡、四卡测试中分别跑了一轮,数据如下:

数据长度 gpu单卡(ms) gpu2个卡(ms) gpu4个卡(ms)
2^24 44.3 44.7 44
2^28 719.7 717.3 703.3
2^30 - 2988.7 2951.7
2^31 - - 5906.7

GPU测试耗时及加速比数据:

数据长度 gpu单卡(ms) gpu2个卡(ms) gpu4个卡(ms)
2^24(耗时/加速比) 27.9/1.6 17.7/2.5 17.1/2.6
2^28(耗时/加速比) 399.8/1.8 273.6/2.6 273.9/2.6
2^30(耗时/加速比) 显存不足 985/3.0 1056.3/2.8
2^31(耗时/加速比) 显存不足 显存不足 1942.4/3.0

5. 结果分析

(1)GPU比CPU计算有明显的性能提升,根据数据规模,数据量越大提升越明显。
(2)总体上,使用多卡比单卡的加速比更高,数据规模越大提升越明显;但从上表数据看,双卡与四卡的加速比基本持平(约2.5~3.0),从双卡增加到四卡并未带来进一步的明显提升。

posted @   安洛8  阅读(8)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
点击右上角即可分享
微信分享提示