1. Goal: multiply matrix A (M×K) by matrix B (K×N) to obtain C (M×N). The tests compare different data-partitioning schemes, MPI communication interfaces, and data-circulation models, measuring both communication and computation efficiency. The reported time is the sum of the user-mode and kernel-mode CPU time consumed by the program.
Problem 1: how to partition the data and initialize each process's local blocks. There are two models:
Model 1. 1-D partitioning: A is split into block rows, B into block columns. Example functions: gemm_1, gemm_2, gemm_2loop, gemm_3.
Model 2. 2-D partitioning: both matrices are split into the same number of blocks along K, and into equal numbers of blocks along M and N. Example function: gemm_cannon.
Problem 2: for a group of processes that must exchange data with each other in order to compute, there are two exchange models:
Model 1. ID traversal: each process keeps its own data permanently and walks through the target process IDs, exchanging data with every other process in turn.
The targets are visited in ascending rank order, so completing all exchanges takes more than n steps; the four processes 0-3 below need 7 steps. Example functions: gemm_1, gemm_2, gemm_cannon.
In the table, rows are process IDs and columns are step numbers; each cell is the target ID whose exchange that process completes in that step.
| pid\step | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|----------|---|---|---|---|---|---|---|
| 0        | 0 | 1 | 2 | 3 |   |   |   |
| 1        |   | 0 |   | 2 | 3 |   |   |
| 2        |   |   | 0 | 1 |   | 3 |   |
| 3        |   |   |   | 0 | 1 | 2 | 3 |
Model 2. Data circulation: each process always sends to and receives from the same fixed ranks, and the data itself circulates: every process sends forward and receives from behind, so each block flows through all processes. Each process keeps two buffers, one for sending "up" and one for receiving from "down"; the block received in the current round becomes the block that is sent and computed on in the next round. The process before rank 0 is the last rank, and the process after the last rank is rank 0. Each process computes for n rounds and communicates for n-1 rounds.
Example functions: gemm_2loop, gemm_3, gemm_cannon2.
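Stripped of the GEMM details, the circulation pattern alone looks roughly like the minimal standalone sketch below (illustration only, not part of the test program; gemm_2loop and gemm_3 embed the same shape):
| #include <mpi.h> |
| #include <stdio.h> |
| |
| /* Minimal sketch of the data-circulation model: each rank sends its current |
| block "up" (rank-1, wrapping) and receives the next block from "down" |
| (rank+1, wrapping); the received block is what it computes on next round. */ |
| int main(int argc, char** argv) |
| { |
| MPI_Init(&argc, &argv); |
| int np, pid; |
| MPI_Comm_size(MPI_COMM_WORLD, &np); |
| MPI_Comm_rank(MPI_COMM_WORLD, &pid); |
| int up   = (pid == 0) ? np - 1 : pid - 1; |
| int down = (pid == np - 1) ? 0 : pid + 1; |
| int work = pid, next = -1;              /* stand-ins for a data block */ |
| for (int n = 0; n < np; n++)            /* np compute rounds, np-1 communication rounds */ |
| { |
| printf("round %d: rank %d computes on block %d\n", n, pid, work); |
| if (n < np - 1) |
| { |
| MPI_Request req[2]; |
| MPI_Isend(&work, 1, MPI_INT, up, 0, MPI_COMM_WORLD, &req[0]); |
| MPI_Irecv(&next, 1, MPI_INT, down, 0, MPI_COMM_WORLD, &req[1]); |
| MPI_Waitall(2, req, MPI_STATUSES_IGNORE); |
| work = next;                     /* received block is used in the next round */ |
| } |
| } |
| MPI_Finalize(); |
| return 0; |
| } |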
Problem 3: if the block grid is not square, the A and B blocks held by a process do not correspond, so the blocks have to be aligned; one matrix can be transposed so that the block grids line up before initialization. Whether or not the matrices are square, the default A and B blocks of a process are not aligned along K. There are two models:
Model 1. Transpose alignment: transpose one of the matrices so the blocks line up along K, then exchange the blocks. Example function: gemm_cannon.
Model 2. Cyclic-shift alignment: each process computes only its own C block. For grid block C[i][j], the blocks of block row A[i][*] are shifted cyclically to the left and the blocks of block column B[*][j] are shifted cyclically upwards: block A[i][j] is initialized to A[i][(i+j) mod P] and block B[i][j] to B[(i+j) mod P][j], where P is the number of blocks per grid dimension.
During communication the A blocks circulate left/right along the block row and the B blocks circulate up/down along the block column, and each partial product is accumulated into the local C block. Example function: gemm_cannon2.
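The initial cyclic-shift indices can be checked with a tiny standalone sketch (illustration only; P stands for the block-grid width, which is sqrt(np) in the test code):
| #include <stdio.h> |
| |
| /* Print the Cannon-style initial skew on a P x P block grid: the process at |
| grid position (i, j) starts with A block (i, (i+j)%P) and B block ((i+j)%P, j). */ |
| int main(void) |
| { |
| int P = 3;  /* example grid width */ |
| for (int i = 0; i < P; i++) |
| for (int j = 0; j < P; j++) |
| printf("grid (%d,%d): A(%d,%d) * B(%d,%d)\n", |
| i, j, i, (i + j) % P, (i + j) % P, j); |
| return 0; |
| } |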
2. Test code
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <mpi.h> |
| #include <sys/time.h> |
| #include <sys/resource.h> |
| |
| |
| |
| void gemm_1(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void gemm_2(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void gemm_2loop(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void gemm_3(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void gemm_cannon(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void gemm_cannon2(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np); |
| void print_matrix(float* matrix, int rows, int cols); |
| int main(int argc,char**argv) |
| { |
| if(argc!=4) |
| { |
| printf("param1: int test function index (gemm_1:0; gemm_2:1; gemm_2loop:2; gemm_3:3; gemm_cannon:4; gemm_cannon2:5)\n"); |
| printf("param2: int dimension of the square matrices\n"); |
| printf("param3: int number of processes\n"); |
| return 1; |
| } |
| int func; |
| int N; |
| int np; |
| func = atoi(argv[1]); |
| N = atoi(argv[2]); |
| np = atoi(argv[3]); |
| |
| float* A = calloc(N*N,sizeof(float)); |
| float* B = calloc(N*N,sizeof(float)); |
| float* C = calloc(N*N,sizeof(float)); |
| |
| |
| for (int i = 0; i < N; ++i) |
| { |
| for (int j = 0; j < N; ++j) |
| { |
| A[i*N+j]= i+j; |
| B[i*N+j]= i+j; |
| } |
| } |
| switch (func) |
| { |
| case 0: |
| gemm_1((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| case 1: |
| gemm_2((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| case 2: |
| gemm_2loop((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| case 3: |
| gemm_3((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| case 4: |
| gemm_cannon((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| case 5: |
| gemm_cannon2((float*)A,N,(float*)B,N,(float*)C,N,N,N,N,np); |
| break; |
| default: |
| break; |
| } |
| free(A); |
| free(B); |
| free(C); |
| return 0; |
| } |
| |
| void print_matrix(float* matrix, int rows, int cols) |
| { |
| printf("matrix:\n"); |
| for (size_t i = 0; i < rows; i++) |
| { |
| for (size_t j = 0; j < cols; j++) |
| printf("%.2f\t",matrix[i*cols+j]); |
| printf("\n"); |
| } |
| } |
| |
| void RequestFree(int count,MPI_Request arr_request[]) |
| { |
| for(int i=0;i<count;i++) |
| MPI_Request_free(&arr_request[i]); |
| } |
| |
| long sum(float* C,int n) |
| { |
| long sum=0; |
| for(int i=0;i<n*n;i++) |
| sum+=C[i]; |
| return sum; |
| } |
3. 1-D row/column partitioning of the matrices, blocking separate send/receive MPI interfaces (MPI_Recv/MPI_Send). Approach: communication uses ID traversal.
| void gemm_1(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = N/np; |
| float blockA[nRb][K]; |
| float blockAR[nRb][K]; |
| int nCb = nRb; |
| float blockB[K][nCb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Status status; |
| struct rusage rustart,ruend; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<K;j++) |
| blockA[i][j] = A[(pid*nRb+i)*lda+j]; |
| } |
| |
| for(int i=0;i<K;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockB[i][j] = B[i*ldb+pid*nCb+j]; |
| } |
| |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<np;n++) |
| { |
| if(n!=pid) |
| { |
| |
| |
| |
| |
| |
| |
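| /* Pair the blocking calls by rank: when the target rank is higher than this rank, |
| receive first and then send, otherwise send first and then receive, so the two |
| ends of each pair never both block inside MPI_Send. */ |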
| if(n>pid) |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| |
| MPI_Recv(blockAR[i],K,MPI_FLOAT,n,0,MPI_COMM_WORLD,&status); |
| MPI_Send(blockA[i],K,MPI_FLOAT,n,0,MPI_COMM_WORLD); |
| } |
| } |
| else |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| |
| MPI_Send(blockA[i],K,MPI_FLOAT,n,0,MPI_COMM_WORLD); |
| MPI_Recv(blockAR[i],K,MPI_FLOAT,n,0,MPI_COMM_WORLD,&status); |
| } |
| } |
| } |
| else |
| memcpy(blockAR,blockA,sizeof(float)*nRb*K); |
| |
| for(int k=0;k<K;k++) |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockC[n*nRb+i][pid*nCb+j] += blockAR[i][k] *blockB[k][j]; |
| } |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend); |
| usec = ruend.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| |
| |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f\n",__func__,N,np,sum(C,N),sec); |
| |
| |
| } |
| MPI_Finalize(); |
| } |
4. 1-D row/column partitioning, non-blocking MPI interfaces (MPI_Irecv/MPI_Isend) with no ordering constraint between send and receive. Approach: communication uses ID traversal.
| void gemm_2(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = N/np; |
| float blockA[nRb][K]; |
| float blockAR[nRb][K]; |
| float blockACal[nRb][K]; |
| int nCb = nRb; |
| float blockB[K][nCb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Request request[2]; |
| struct rusage rustart,ruend; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<K;j++) |
| blockA[i][j] = A[(pid*nRb+i)*lda+j]; |
| } |
| |
| for(int i=0;i<K;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockB[i][j] = B[i*ldb+pid*nCb+j]; |
| } |
| |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<np;n++) |
| { |
| |
| if(n!=pid) |
| { |
| MPI_Irecv(blockAR,K*nRb,MPI_FLOAT,n,0,MPI_COMM_WORLD,&request[0]); |
| MPI_Isend(blockA,K*nRb,MPI_FLOAT,n,0,MPI_COMM_WORLD,&request[1]); |
| MPI_Waitall(2,request,MPI_STATUSES_IGNORE); |
| } |
| else |
| memcpy(blockAR,blockA,sizeof(float)*nRb*K); |
| |
| for(int k=0;k<K;k++) |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockC[n*nRb+i][pid*nCb+j] += blockAR[i][k] *blockB[k][j]; |
| } |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend); |
| usec = ruend.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f\n",__func__,N,np,sum(C,N),sec); |
| |
| |
| } |
| |
| MPI_Finalize(); |
| } |
5. 1-D row/column partitioning, non-blocking MPI interfaces (MPI_Irecv/MPI_Isend) with no send/receive ordering constraint and communication overlapped with computation. Approach: communication uses data circulation; optimization: with nB processes only nB-1 communication rounds are needed.
| void gemm_2loop(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = N/np; |
| float blockA[nRb][K]; |
| float blockAR[nRb][K]; |
| int nCb = nRb; |
| float blockB[K][nCb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Request request[2]; |
| struct rusage rustart,ruend; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<K;j++) |
| blockA[i][j] = A[(pid*nRb+i)*lda+j]; |
| } |
| |
| for(int i=0;i<K;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockB[i][j] = B[i*ldb+pid*nCb+j]; |
| } |
| |
| int up = (pid==0)?np-1:pid-1; |
| int down = (pid==np-1)?0:pid+1; |
| |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<np;n++) |
| { |
| |
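| /* Post the ring transfer (send to 'up', receive from 'down') before computing on |
| the current block and wait only afterwards, so communication overlaps computation. */ |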
| if(n<np-1) |
| { |
| MPI_Isend(blockA,K*nRb,MPI_FLOAT,up,0,MPI_COMM_WORLD,&request[0]); |
| MPI_Irecv(blockAR,K*nRb,MPI_FLOAT,down,0,MPI_COMM_WORLD,&request[1]); |
| } |
| |
| for(int k=0;k<K;k++) |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockC[(pid+n)%np *nRb+i][pid*nCb+j] += blockA[i][k] *blockB[k][j]; |
| } |
| } |
| |
| if(n<np-1) |
| { |
| MPI_Waitall(2,request,MPI_STATUSES_IGNORE); |
| memcpy(blockA,blockAR,sizeof(float)*nRb*K); |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend); |
| usec = ruend.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f\n",__func__,N,np,sum(C,N),sec); |
| |
| |
| } |
| MPI_Finalize(); |
| } |
6. 1-D row/column partitioning, persistent (repeated non-blocking) MPI interfaces MPI_Send_init/MPI_Start/MPI_Wait. Approach: communication uses Model 2 (data circulation); optimization: with nB processes only nB-1 communication rounds are needed.
| void gemm_3(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = N/np; |
| float blockA[nRb][K]; |
| float blockAR[nRb][K]; |
| int nCb = nRb; |
| float blockB[K][nCb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Request request[2]; |
| MPI_Status status[2] ={0}; |
| struct rusage rustart,ruend; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<K;j++) |
| blockA[i][j] = A[(pid*nRb+i)*lda+j]; |
| } |
| |
| for(int i=0;i<K;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockB[i][j] = B[i*ldb+pid*nCb+j]; |
| } |
| |
| int up = (pid==0)?np-1:pid-1; |
| int down = (pid==np-1)?0:pid+1; |
| |
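| /* Persistent requests: the ring send/receive are set up once here and re-started |
| each round with MPI_Startall instead of building new requests every iteration. */ |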
| MPI_Send_init(blockA,K*nRb,MPI_FLOAT,up,0,MPI_COMM_WORLD,&request[0]); |
| MPI_Recv_init(blockAR,K*nRb,MPI_FLOAT,down,0,MPI_COMM_WORLD,&request[1]); |
| |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<np;n++) |
| { |
| |
| if(n<np-1) |
| MPI_Startall(2,request); |
| |
| for(int k=0;k<K;k++) |
| { |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockC[(pid+n)%np *nRb+i][pid*nCb+j] += blockA[i][k] *blockB[k][j]; |
| } |
| } |
| if(n<np-1) |
| { |
| MPI_Waitall(2,request,MPI_STATUSES_IGNORE); |
| memcpy(blockA,blockAR,sizeof(float)*nRb*K); |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend); |
| usec = ruend.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f\n",__func__,N,np,sum(C,N),sec); |
| |
| |
| } |
| if(size==np) RequestFree(2,request); |
| MPI_Finalize(); |
| } |
7. 2-D row/column partitioning with Cartesian coordinates, persistent (repeated non-blocking) MPI interfaces MPI_Send_init/MPI_Start/MPI_Wait. Approach: each process initializes its own blocks; for C block [i][j] the local data starts as A[i][j] and B[i][j]. Communication uses ID traversal.
Computation: the key point is knowing which blocks, and therefore which processes, each block has to be combined with. Split A into block rows A[i] and B into block rows B[i]; A[i] and B[i] are further split into column blocks, the blocks inside A[i] and B[i] are multiplied pairwise, and the per-process results are summed at the end.
Difficulty: the group of pids holding the blocks of A[i] and the group of pids holding the matching blocks of B[i] may not overlap, which can deadlock the communication.
Solution: transpose B[i] before communicating, which guarantees that the send and receive partners cover each other completely; after the transpose, the per-block computation changes to multiplying the elements column-wise. The transpose approach also suits non-square matrices whose A and B blocks are not aligned.
| void gemm_cannon(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = (int)N/sqrt(np); |
| int nCb = nRb; |
| float blockA[nRb][nCb]; |
| float blockB[nCb][nRb]; |
| float blockBR[nCb][nRb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Request request[2*(ldb/nCb)]; |
| MPI_Status status[2] ={0}; |
| struct rusage rustart,ruend; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| int dims[2]={M/nRb,K/nCb}; |
| int coords[2]={0}; |
| int periods[2]={0,0}; |
| MPI_Comm cartcomm; |
| |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| MPI_Cart_create(MPI_COMM_WORLD,2,dims,periods,0,&cartcomm); |
| MPI_Cart_coords(cartcomm,pid,2,coords); |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockA[i][j] = A[(coords[0]*nRb+i)*lda+coords[1]*nCb+j]; |
| } |
| for (int i = 0; i < nRb; i++) |
| { |
| |
| |
| for (int j = 0; j < nCb; j++) |
| blockB[j][i] = B[(coords[1]*nRb+i)*ldb+coords[0]*nCb+j]; |
| } |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
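| /* Process (r,c) holds A block (r,c) and the transpose of B block (c,r); in step n it |
| swaps its B block with the process at grid position (n,c) (MPI_PROC_NULL when that is |
| itself) and accumulates A(r,c)*B(c,n) into C block (r,n). MPI_Reduce later sums the |
| partial C blocks of all processes. */ |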
| for (int n = 0; n < ldb/nCb; n++) |
| { |
| int dst = lda/nCb*n+coords[1]; |
| dst = (dst==pid)?MPI_PROC_NULL:dst; |
| MPI_Send_init(blockB,nRb*nCb,MPI_FLOAT,dst,0,MPI_COMM_WORLD,&request[2*n]); |
| MPI_Recv_init(blockBR,nRb*nCb,MPI_FLOAT,dst,0,MPI_COMM_WORLD,&request[2*n+1]); |
| } |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<ldb/nCb;n++) |
| { |
| int dst = lda/nCb*n+coords[1]; |
| dst = (dst==pid)?MPI_PROC_NULL:dst; |
| |
| if(dst == MPI_PROC_NULL) |
| memcpy(blockBR,blockB,sizeof(float)*nRb*nCb); |
| else |
| { |
| MPI_Startall(2,&request[2*n]); |
| MPI_Waitall(2,&request[2*n],status); |
| } |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| { |
| for (int k = 0; k < nCb; k++) |
| blockC[coords[0]*nRb+i][n*nCb+j] += blockA[i][k] * blockBR[j][k]; |
| } |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend); |
| usec = ruend.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| |
| |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f\n",__func__,N,np,sum(C,N),sec); |
| |
| |
| } |
| if(size==np) RequestFree(2*(K/nCb),request); |
| MPI_Finalize(); |
| } |
8. 2-D row/column partitioning with Cartesian coordinates, persistent (repeated non-blocking) MPI interfaces MPI_Send_init/MPI_Start/MPI_Wait. Block alignment uses cyclic-shift alignment; communication uses data circulation.
| void gemm_cannon2(float* A, int lda, float* B, int ldb, float* C, int ldc,int M,int K,int N,int np) |
| { |
| int size; |
| int pid; |
| int nRb = (int)N/sqrt(np); |
| int nCb = nRb; |
| float blockA[nRb][nCb]; |
| float blockB[nRb][nCb]; |
| float blockAR[nRb][nCb]; |
| float blockBR[nRb][nCb]; |
| float blockC[M][ldc]; |
| memset(blockC,0,sizeof(float)*ldc*M); |
| MPI_Request request[2*2]; |
| MPI_Status status[2*2] ={0}; |
| struct rusage rustart,ruend1,ruend2; |
| float usec=0.0; |
| float ssec=0.0; |
| float sec=0.0; |
| float sec1=0.0; |
| int dims[2]={M/nRb,K/nCb}; |
| int coords[2]={0}; |
| int periods[2]={0,0}; |
| MPI_Comm cartcomm; |
| int rtag = 0; |
| int ctag = 1; |
| |
| |
| MPI_Init(NULL,NULL); |
| MPI_Comm_size(MPI_COMM_WORLD,&size); |
| MPI_Comm_rank(MPI_COMM_WORLD,&pid); |
| if(size==np) |
| { |
| MPI_Cart_create(MPI_COMM_WORLD,2,dims,periods,0,&cartcomm); |
| MPI_Cart_coords(cartcomm,pid,2,coords); |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nCb;j++) |
| blockA[i][j] = A[(coords[0]*nRb+i)*lda + (coords[1] + coords[0])%(lda/nCb)*nCb +j]; |
| } |
| |
| for (int i = 0; i < nRb; i++) |
| { |
| for (int j = 0; j < nCb; j++) |
| blockB[i][j] = B[((coords[0] +coords[1])%(M/nRb)*nCb+i)*ldb + coords[1]*nCb+j]; |
| } |
| |
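| /* The Cartesian grid is non-periodic, so MPI_Cart_shift returns MPI_PROC_NULL at the |
| edges; the wrap-around ring neighbours are patched in by hand below. */ |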
| int left,right,up,down; |
| MPI_Cart_shift(cartcomm,1,1,&left,&right); |
| MPI_Cart_shift(cartcomm,0,1,&up,&down); |
| left = coords[1]==0?lda/nCb*(coords[0]+1)-1:left; |
| right = coords[1]==K/nRb-1?lda/nCb*coords[0]:right; |
| up = coords[0]==0?np-lda/nCb+coords[1]:up; |
| down = coords[0]==K/nRb-1?(np+coords[1])%np:down; |
| |
| |
| |
| |
| MPI_Send_init(blockA,nRb*nCb,MPI_FLOAT,left,rtag,MPI_COMM_WORLD,&request[0]); |
| MPI_Recv_init(blockAR,nRb*nCb,MPI_FLOAT,right,rtag,MPI_COMM_WORLD,&request[1]); |
| |
| MPI_Send_init(blockB,nRb*nCb,MPI_FLOAT,up,ctag,MPI_COMM_WORLD,&request[2]); |
| MPI_Recv_init(blockBR,nRb*nCb,MPI_FLOAT,down,ctag,MPI_COMM_WORLD,&request[3]); |
| getrusage(RUSAGE_SELF,&rustart); |
| for(int n=0;n<lda/nCb;n++) |
| { |
| if(n<(lda/nCb-1)) |
| MPI_Startall(4,request); |
| |
| for(int i=0;i<nRb;i++) |
| { |
| for(int j=0;j<nRb;j++) |
| { |
| for (int k = 0; k < nCb; k++) |
| blockC[coords[0]*nRb+i][coords[1]*nCb+j] += blockA[i][k]*blockB[k][j]; |
| } |
| } |
| if(n<(lda/nCb-1)) |
| { |
| MPI_Waitall(4,request,status); |
| memcpy(blockA,blockAR,sizeof(float)*nRb*nCb); |
| memcpy(blockB,blockBR,sizeof(float)*nRb*nCb); |
| } |
| } |
| } |
| else |
| printf("shoule parameter: -n %d\n",np); |
| getrusage(RUSAGE_SELF,&ruend1); |
| MPI_Reduce(blockC,C,M*ldc,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); |
| getrusage(RUSAGE_SELF,&ruend2); |
| usec = ruend2.ru_utime.tv_sec-rustart.ru_utime.tv_sec+(ruend2.ru_utime.tv_usec-rustart.ru_utime.tv_usec)/1e6; |
| ssec = ruend2.ru_stime.tv_sec-rustart.ru_stime.tv_sec+(ruend2.ru_stime.tv_usec-rustart.ru_stime.tv_usec)/1e6; |
| sec = usec+ssec; |
| usec = ruend2.ru_utime.tv_sec-ruend1.ru_utime.tv_sec+(ruend2.ru_utime.tv_usec-ruend1.ru_utime.tv_usec)/1e6; |
| ssec = ruend2.ru_stime.tv_sec-ruend1.ru_stime.tv_sec+(ruend2.ru_stime.tv_usec-ruend1.ru_stime.tv_usec)/1e6; |
| sec1 = usec+ssec; |
| |
| |
| MPI_Barrier(MPI_COMM_WORLD); |
| if(pid==0) |
| { |
| printf("func %s N %d np %d sum %ld useTimeAll %f reduceTime %f\n",__func__,N,np,sum(C,N),sec,sec1); |
| |
| |
| } |
| if(size==np) RequestFree(2*2,request); |
| MPI_Finalize(); |
| } |
9. Test script: each function/parameter combination is run 4 times.
| #!/bin/bash |
| |
| mpicc gemm.c -o gemm -lm |
| |
| echo "start $(date)" >> out |
| echo "compile final." >> out |
| echo "run start" >> out |
| |
| |
| |
| func1=(0 1 2 3) |
| |
| N1=(16 32 64 128 256 512 1024) |
| |
| np1=(2 4 8 16 32 64) |
| for f in "${func1[@]}"; do |
| for N in "${N1[@]}"; do |
| for np in "${np1[@]}"; do |
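| # run only when N >= np, so every process owns at least one row/column block |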
| if [ "$N" -ge "$np" ]; then |
| |
| for ((i=1;i<=4;i++)); do |
| mpirun -n "$np" ./gemm "$f" "$N" "$np" | tee -a out |
| done |
| fi |
| done |
| done |
| done |
| |
| |
| |
| func2=(4 5) |
| |
| N2=(64 128 256 512 1024 2048) |
| |
| np2=($(seq 2 64)) |
| for f in "${func2[@]}"; do |
| for N in "${N2[@]}"; do |
| for np in "${np2[@]}"; do |
| |
| num=$((N * N / np)) |
| |
| sqrt=$(echo "sqrt($num)" | bc) |
| sqrt_f=${sqrt%.*} |
| |
| |
| |
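| # run only when each process's tile is a perfect square whose side is a multiple of 4 |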
| if [ $((sqrt_f * sqrt_f)) -eq $num ] && [ $((sqrt_f % 4)) -eq 0 ]; then |
| |
| for ((i=1;i<=4;i++)); do |
| |
| mpirun -n "$np" ./gemm "$f" "$N" "$np" | tee -a out |
| done |
| fi |
| done |
| done |
| done |
| |
| echo "run end" >> out |
| echo "start $(date)" >> out |
10. Test command
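The script launches each case as, for example (function index 2 = gemm_2loop, N = 256, 4 processes):
| mpirun -n 4 ./gemm 2 256 4 |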
11. Test data processing
a. For each parameter combination, discard the maximum and the minimum of the 4 measurements and average the rest, to suppress outliers:
| Array X with n elements |
| M = max(X) |
| m = min(X) |
| X' = X \ {M, m} |
| result = mean(X') |
b. A confidence value is then derived from how much the mean shifts before and after the trimming.
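A minimal sketch of the trimming in step a (a hypothetical helper; the actual processing was done outside the test program, in a spreadsheet):
| #include <stdio.h> |
| |
| /* Trimmed mean: drop the maximum and the minimum of the n samples (n >= 3) |
| and average the rest, as done for the 4 runs of each parameter combination. */ |
| double trimmed_mean(const double* x, int n) |
| { |
| double sum = 0.0, max = x[0], min = x[0]; |
| for (int i = 0; i < n; i++) |
| { |
| sum += x[i]; |
| if (x[i] > max) max = x[i]; |
| if (x[i] < min) min = x[i]; |
| } |
| return (sum - max - min) / (n - 2); |
| } |
| |
| int main(void) |
| { |
| double t[4] = {0.0278, 0.0262, 0.0301, 0.0249};  /* example timings of 4 runs */ |
| printf("trimmed mean: %f\n", trimmed_mean(t, 4)); |
| return 0; |
| } |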
12. Results after processing
Data analysis: gemm-mpi performance tests. np is the number of processes, N the dimension of the square matrices; times are in seconds.
gemm_1: 1-D partitioning, blocking separate send/receive, ID-traversal communication
| np\N | 16 | 32 | 64 | 128 | 256 | 512 | 1024 |
|------|----|----|----|-----|-----|-----|------|
| 2  | 0.001545 | 0.001634 | 0.006977 | 0.026222 | 0.191657 | 1.515164 | 11.369472 |
| 4  | 0.004836 | 0.004293 | 0.009377 | 0.024945 | 0.137354 | 0.959660 | 6.966370 |
| 8  | 0.007710 | 0.013356 | 0.013878 | 0.030420 | 0.106415 | 0.557896 | 4.018535 |
| 16 | 0.032612 | 0.024304 | 0.060477 | 0.043809 | 0.109555 | 0.443862 | 2.228010 |
| 32 |          | 0.150871 | 0.128805 | 0.132738 | 0.163810 | 0.684472 | 1.649284 |
| 64 |          |          | 1.094653 | 1.303649 | 1.450869 | 2.193258 | 4.093855 |
gemm_2: 1-D partitioning, non-blocking, ID-traversal communication
| np\N | 16 | 32 | 64 | 128 | 256 | 512 | 1024 |
|------|----|----|----|-----|-----|-----|------|
| 2  | 0.000905 | 0.001036 | 0.003665 | 0.023131 | 0.172192 | 1.369514 | 11.175500 |
| 4  | 0.002645 | 0.002272 | 0.003960 | 0.016218 | 0.105236 | 0.803488 | 6.890222 |
| 8  | 0.005941 | 0.006732 | 0.007395 | 0.015422 | 0.077109 | 0.462515 | 3.636947 |
| 16 | 0.026545 | 0.021659 | 0.020293 | 0.023825 | 0.061243 | 0.447386 | 1.923664 |
| 32 |          | 0.069077 | 0.069710 | 0.072841 | 0.097416 | 0.307438 | 1.412197 |
| 64 |          |          | 0.394969 | 0.777508 | 0.884014 | 0.604297 | 1.771925 |
gemm_2loop: 1-D partitioning, non-blocking, data circulation
| np\N | 16 | 32 | 64 | 128 | 256 | 512 | 1024 |
|------|----|----|----|-----|-----|-----|------|
| 2  | 0.000673 | 0.001030 | 0.002680 | 0.016098 | 0.130159 | 1.021306 | 8.148343 |
| 4  | 0.001068 | 0.001795 | 0.002234 | 0.009042 | 0.066684 | 0.530850 | 4.345813 |
| 8  | 0.001933 | 0.004593 | 0.002888 | 0.006573 | 0.037661 | 0.273103 | 2.238982 |
| 16 | 0.005567 | 0.006254 | 0.005355 | 0.008131 | 0.023581 | 0.148000 | 1.214175 |
| 32 |          | 0.012959 | 0.019243 | 0.017772 | 0.026740 | 0.676488 | 0.661177 |
| 64 |          |          | 0.075113 | 0.104431 | 0.148001 | 0.672817 | 0.701280 |
gemm_3: 1-D partitioning, persistent non-blocking, data circulation
| np\N | 16 | 32 | 64 | 128 | 256 | 512 | 1024 |
|------|----|----|----|-----|-----|-----|------|
| 2  | 0.000843 | 0.000905 | 0.002784 | 0.016305 | 0.130404 | 1.050064 | 8.231962 |
| 4  | 0.001050 | 0.001429 | 0.002877 | 0.009200 | 0.068053 | 0.532743 | 4.271506 |
| 8  | 0.003859 | 0.003621 | 0.004517 | 0.007998 | 0.038354 | 0.286389 | 2.213302 |
| 16 | 0.003490 | 0.010043 | 0.006398 | 0.008243 | 0.023873 | 0.154672 | 1.148860 |
| 32 |          | 0.017765 | 0.021530 | 0.013125 | 0.037160 | 0.091697 | 0.656130 |
| 64 |          |          | 0.115553 | 0.124888 | 0.179614 | 0.197108 | 0.778555 |
gemm_cannon: 2-D partitioning, persistent non-blocking, ID-traversal communication
| np\N | 64 | 128 | 256 | 512 | 1024 |
|------|----|-----|-----|-----|------|
| 4  | 0.001640 | 0.011581 | 0.097141 | 0.743693 | 5.857058 |
| 16 | 0.004127 | 0.007358 | 0.033916 | 0.232204 | 1.791188 |
| 64 | 0.086333 | 0.102424 | 0.135290 | 0.175451 | 1.192103 |
gemm_cannon2: 2-D partitioning, persistent non-blocking, data circulation
| np\N | 64 | 128 | 256 | 512 | 1024 |
|------|----|-----|-----|-----|------|
| 4  | 0.001433 | 0.008870 | 0.073905 | 0.673483 | 7.716168 |
| 16 | 0.006797 | 0.007814 | 0.024610 | 0.162873 | 1.594037 |
| 64 | 0.078948 | 0.074702 | 0.097038 | 0.155490 | 0.742144 |
13. Conclusions
Findings:
- For matrix dimensions of 64 and below, with either 1-D or 2-D partitioning, fewer processes give higher performance. Reason: with such a small amount of computation, the cost of parallel communication dominates the overall efficiency.
- For matrix dimensions of 128-1024, adding processes at first improves efficiency markedly, because the gain from parallel computation far outweighs the extra communication. In this range 16-32 processes are the most efficient; beyond that, the growing communication cost gradually becomes a net drag.
- In the exchange model where every process keeps fixed send/receive partners and the data is circulated, communication efficiency improves noticeably.
- Non-blocking communication has a clear efficiency advantage over blocking communication.
- 2-D grid partitioning has a decisive efficiency advantage over 1-D partitioning.
Issues left for future work:
- The test machine has only 64 CPU cores, so higher degrees of parallelism were not tested.
- Only block dimensions that are multiples of 4 were tested; there is no finer-grained sweep.
- The test only supports square matrices.
- MPI_Init cannot be called more than once in a program, so the test program cannot post-process the large amount of timing data itself; the data is currently processed in Excel.