Simple use of MPI
▶ Source code. The root process on the host generates a random array and scatters one chunk of it to each node (this example uses only one node); each node computes the square roots of its chunk on the GPU and sums them locally, and the root then uses MPI to collect the per-node results, reduces them to a total sum, and divides by the array size (i.e., it computes the average of the square roots of all elements of the random array).
// simpleMPI.h
extern "C"
{
    void initData(float *data, int dataSize);
    void computeGPU(float *hostData, int blockSize, int gridSize);
    float sum(float *data, int size);
    void my_abort(int err);
}
// simpleMPI.cu
#include <iostream>
#include <cstdlib>   // rand, RAND_MAX
#include <mpi.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "simpleMPI.h"

using std::cout;
using std::cerr;
using std::endl;

#define CUDA_CHECK(call)                                                         \
    if ((call) != cudaSuccess)                                                   \
    {                                                                            \
        cudaError_t err = cudaGetLastError();                                    \
        cerr << "CUDA error calling \"" #call "\", code is " << err << endl;     \
        my_abort(err);                                                           \
    }

// GPU kernel: element-wise square root
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = sqrt(input[tid]);
}

// Initialize the array with random values in [0, 1]
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++)
        data[i] = (float)rand() / RAND_MAX;
}

// Run the computation on the GPU
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;

    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}

// Simple CPU sum
float sum(float *data, int size)
{
    float accum = 0.f;
    for (int i = 0; i < size; i++)
        accum += data[i];
    return accum;
}

// Abort helper
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
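The sample never calls cudaSetDevice, so every rank uses the default device, which is fine with one rank per node (as in this run). On a node with several GPUs and several ranks, a common pattern is to bind each rank to a device by rank index. The sketch below is only illustrative; the helper name and the rank-to-device mapping are assumptions, not part of the sample.

// Hypothetical helper (not in the sample): bind the calling MPI rank to a GPU.
// Assumes ranks on the same node are numbered consecutively, which a launcher
// does not guarantee; a robust version would use the node-local rank instead.
#include <mpi.h>
#include <cuda_runtime.h>

void selectDeviceForRank()
{
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    if (deviceCount > 0)
        cudaSetDevice(rank % deviceCount);   // round-robin ranks over devices
}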
// simpleMPI.cpp
#include <mpi.h>
#include <iostream>
#include <cstdio>    // getchar
#include "simpleMPI.h"

using std::cout;
using std::cerr;
using std::endl;

#define MPI_CHECK(call) if ((call) != MPI_SUCCESS) { cerr << "MPI error calling \"" #call "\"\n"; my_abort(-1); }

int main(int argc, char *argv[])
{
    int blockSize = 256;
    int gridSize = 10000;
    int dataSizePerNode = gridSize * blockSize;

    // Initialize MPI
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get the communicator size and this process's rank
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // The root node generates the random array
    int dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot = NULL;
    if (commRank == 0)
    {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }

    // Each node allocates a buffer to receive its chunk from the root
    float *dataNode = new float[dataSizePerNode];

    MPI_CHECK(MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

    // The root's copy is no longer needed
    if (commRank == 0)
        delete[] dataRoot;

    // Each node computes square roots on the GPU, then reduces its chunk to one value
    computeGPU(dataNode, blockSize, gridSize);
    float sumNode = sum(dataNode, dataSizePerNode);

    // Collect the per-node results on the root and reduce them to a total sum
    float sumRoot;
    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    // Clean up and report
    delete[] dataNode;
    MPI_CHECK(MPI_Finalize());

    if (commRank == 0)
    {
        float average = sumRoot / dataSizeTotal;
        cout << "Average of square roots is: " << average << endl;
        cout << "PASSED\n";
    }

    getchar();
    return 0;
}
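One possible extension (not part of the sample): instead of freeing dataRoot right after MPI_Scatter, the root could keep it and compute a CPU reference average to compare against the MPI/GPU result. A minimal sketch of such a helper, with a hypothetical name, might be:

// Hypothetical verification helper: average of the square roots, on the CPU.
// Could be called on the root with dataRoot and dataSizeTotal before the delete.
#include <cmath>

float cpuReferenceAverage(const float *data, int size)
{
    double accum = 0.0;    // double accumulator to limit rounding error
    for (int i = 0; i < size; i++)
        accum += std::sqrt(static_cast<double>(data[i]));
    return static_cast<float>(accum / size);
}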
▶ Output
Running on 1 nodes
Average of square roots is: 0.667507
PASSED
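The printed value can be sanity-checked analytically: the elements are uniform on [0, 1], so the expected square root is

$$\mathbb{E}[\sqrt{X}] = \int_0^1 \sqrt{x}\,dx = \frac{2}{3} \approx 0.6667,$$

which agrees with the reported 0.667507 up to sampling noise and the single-precision accumulation in sum().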
▶ Takeaways
● The interest here lies in the use of MPI's collective communication functions (MPI_Scatter and MPI_Reduce); the CUDA part introduces nothing new. A minimal standalone sketch of these two collectives follows below.
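For reference, here is a minimal program exercising just MPI_Scatter and MPI_Reduce with made-up data and no CUDA; it is a sketch, not taken from the sample, and all names in it are illustrative.

// scatter_reduce_demo.cpp -- minimal sketch of the two collectives used above
#include <mpi.h>
#include <vector>
#include <iostream>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int size = 0, rank = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int perRank = 4;              // elements each rank receives
    std::vector<float> full;            // send buffer, only meaningful on the root
    if (rank == 0)
        full.assign(perRank * size, 1.0f);   // simple known data: all ones

    // Each rank gets its own contiguous chunk of the root's array.
    std::vector<float> chunk(perRank);
    MPI_Scatter(full.data(), perRank, MPI_FLOAT,
                chunk.data(), perRank, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Local partial result, then a sum reduction back onto the root.
    float local = 0.0f;
    for (float v : chunk)
        local += v;

    float total = 0.0f;
    MPI_Reduce(&local, &total, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0)
        std::cout << "total = " << total << " (expected " << perRank * size << ")\n";

    MPI_Finalize();
    return 0;
}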