1. 目标
| 设计一个程序,使用OpenMP并行化实现矩阵乘法。给定两个矩阵 A 和 B,矩阵大小均为1024*1024,你的任务是计算它们的乘积 C。 |
| 要求: |
| (1)、使用循环结构体的知识点,包括for循环体并行化、变量规约属性与子句reduction、循环调度策略与子句schedule以及嵌套循环与子句collapse。 |
| (2)、实现OpenMP并行化以加速矩阵乘法的计算。 |
| (3)、考虑内存一致性,确保数据在并行计算过程中的正确性。 |
| (4)、可选:实现线程亲核性,将线程绑定到特定的CPU核心上执行。 |
2. 测试简介
通过对比串行与并行实现,以及 collapse 与 reduction 两种策略,测试各方案的效率。
3. 测试代码
| #include <stdio.h> |
| #include <omp.h> |
| #include <sys/time.h> |
| #include <stdlib.h> |
| |
| void gemm_1(int* A,int* B,long* C,int N,int NT); |
| void gemm_2(int* A,int* B,long* C,int N,int NT); |
| void gemm_3(int* A,int* B,long* C,int N,int NT); |
| void gemm_4(int* A,int* B,long* C,int N,int NT); |
| void gemm_5(int* A,int* B,long* C,int N,int NT); |
| void gemm_6(int* A,int* B,long* C,int N,int NT); |
| void gemm_7(int* A,int* B,long* C,int N,int NT); |
| void gemm_8(int* A,int* B,long* C,int N,int NT); |
// Entry point: selects a GEMM variant (1-8), matrix dimension N and thread
// count NT from the command line, initializes A and B with A[r][c] =
// B[r][c] = r + c, runs the chosen kernel, then frees the buffers.
// Usage: ./exe <func 1-8> <dim N> [thread_count]
int main(int argc, char **argv)
{
    if (argc != 4 && argc != 3)
    {
        // Fixed typo: "shoule" -> "should".
        printf("should param: ./exe [(int)para1:select func] [(int)para2: dim] [(int)para3 thread_nums]\n");
        return 1;
    }
    int func = atoi(argv[1]);
    int N = atoi(argv[2]);
    int NT = (argc == 4) ? atoi(argv[3]) : 1;

    // calloc zero-initializes C, which the accumulating kernels
    // (gemm_4/5/7) rely on.
    int *A = calloc(N * N, sizeof(int));
    int *B = calloc(N * N, sizeof(int));
    long *C = calloc(N * N, sizeof(long));
    if (!A || !B || !C)
    {
        // Bug fix: original never checked the allocations.
        printf("allocation of %d x %d matrices failed\n", N, N);
        free(A);
        free(B);
        free(C);
        return 1;
    }

    for (int r = 0; r < N; r++)
    {
        for (int c = 0; c < N; c++)
            A[r * N + c] = B[r * N + c] = r + c;
    }

    switch (func)
    {
    case 1:
        NT = 1; // serial baseline
        gemm_1(A, B, C, N, NT);
        break;
    case 2:
        gemm_2(A, B, C, N, NT);
        break;
    case 3:
        gemm_3(A, B, C, N, NT);
        break;
    case 4:
        NT = 1; // serial blocked, K outermost
        gemm_4(A, B, C, N, NT);
        break;
    case 5:
        // Bug fix: case 5 was empty, so gemm_5 could never be benchmarked.
        gemm_5(A, B, C, N, NT);
        break;
    case 6:
        NT = 1; // serial blocked, K innermost
        gemm_6(A, B, C, N, NT);
        break;
    case 7:
        gemm_7(A, B, C, N, NT);
        break;
    case 8:
        gemm_8(A, B, C, N, NT);
        break;
    default:
        break;
    }

    free(A);
    free(B);
    free(C);
    return 0;
}
| |
// Checksum helper: returns the total of all n*n elements of C, used by the
// gemm_* kernels to verify that every variant produces the same result.
long sum(long* C,int n)
{
    long acc = 0;
    int count = n * n;
    for (int idx = 0; idx < count; idx++)
        acc += C[idx];
    return acc;
}
| |
// Print a rows x cols matrix of long values: a header line, then one
// tab-separated row per line.
void print_matrix(long* matrix, int rows, int cols)
{
    printf("matrix:\n");
    for (size_t r = 0; r < rows; r++)
    {
        for (size_t c = 0; c < cols; c++)
            printf("%ld\t", matrix[r * cols + c]);
        printf("\n");
    }
}
| |
// Print a rows x cols matrix of int values: a header line, then one
// tab-separated row per line.
void print_matrixI(int* matrix, int rows, int cols)
{
    printf("matrix:\n");
    for (size_t r = 0; r < rows; r++)
    {
        for (size_t c = 0; c < cols; c++)
            printf("%d\t", matrix[r * cols + c]);
        printf("\n");
    }
}
(1) 矩阵不分块 串行
// (1) Naive serial matrix multiply, no blocking: C = A * B, row-major N x N.
// Prints the elapsed wall-clock time and a checksum of C on completion.
// NT is unused here except for reporting (serial baseline).
void gemm_1(int* A,int* B,long* C,int N,int NT)
{
    struct timeval t0, t1;

    gettimeofday(&t0, NULL);
    for (int r = 0; r < N; r++)
    {
        for (int c = 0; c < N; c++)
        {
            long acc = 0;
            for (int k = 0; k < N; k++)
                acc += A[r*N + k] * B[k*N + c];
            C[r*N + c] = acc;
        }
    }
    gettimeofday(&t1, NULL);

    float elapsed = t1.tv_sec - t0.tv_sec + (t1.tv_usec - t0.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), elapsed);
}
(2) 矩阵不分块 并行 collapse(2)
// (2) Parallel, no blocking: collapse(2) flattens the (r, c) loops so the
// N*N output cells are distributed across NT threads.  Each cell is written
// by exactly one thread, so no synchronization on C is needed.
void gemm_2(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start, end;
    float time;

    gettimeofday(&start, NULL);
    omp_set_num_threads(NT);

    // Removed the dead "#if 0" variant that accumulated into C under
    // "#pragma omp critical": it serialized every update and was never
    // compiled.
    #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
    for (int r = 0; r < N; r++)
    {
        for (int c = 0; c < N; c++)
        {
            long sum = 0; // thread-local accumulator: no race on C
            for (int k = 0; k < N; k++)
                sum += A[r*N+k] * B[k*N+c];
            C[r*N+c] = sum;
        }
    }
    gettimeofday(&end, NULL);
    time = end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), time);
}
(3) 矩阵不分块 并行 collapse(2)+ reduction
// (3) Parallel, no blocking: collapse(2) over (r, c) plus a NESTED
// "parallel for reduction(+:sum)" over k.
// NOTE(review): the inner pragma opens a nested parallel region per output
// cell; unless nested parallelism is enabled (OMP_NESTED /
// omp_set_max_active_levels) it runs with one thread, so this variant
// mainly measures the overhead of the reduction construct — confirm intent.
// r, c and k are loop-control variables of "for" worksharing constructs and
// are therefore implicitly private even though declared at function scope.
void gemm_3(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start,end;
    float time;
    int r,c,k;

    gettimeofday(&start,NULL);
    omp_set_num_threads(NT);
    #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
    for(r=0;r<N;r++)
    {
        for(c=0;c<N;c++)
        {
            // Per-cell accumulator: reduction gives each inner thread a
            // private copy and combines them when the region ends.
            long sum=0;
            #pragma omp parallel for reduction(+:sum) schedule(guided) proc_bind(close)
            for(k=0;k<N;k++)
                sum += A[r*N+k] * B[k*N+c];
            C[r*N+c] = sum;
        }
    }
    gettimeofday(&end,NULL);
    time = end.tv_sec-start.tv_sec+(end.tv_usec-start.tv_usec)/1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n",__func__,N,NT,sum(C,N),time);
}
(4) 矩阵一维行列分块 A块 4N,B块 N4,串行,K在外层
// (4) Serial blocked multiply: 4xN strips of A against Nx4 strips of B,
// with k in the outer position inside each tile.  Accumulates directly
// into C, which main() zero-initialized via calloc.
// Bug fix: the original indexed r+nr / c+nc without bounds checks, writing
// out of bounds whenever N is not a multiple of 4 (main accepts any N).
// The added "r+nr < N" / "c+nc < N" guards are no-ops when 4 | N.
void gemm_4(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start, end;
    float time;
    int r, c, k;

    gettimeofday(&start, NULL);
    for (r = 0; r < N; r += 4)
    {
        for (c = 0; c < N; c += 4)
        {
            for (k = 0; k < N; k++)
            {
                for (int nr = 0; nr < 4 && r + nr < N; nr++)
                {
                    for (int nc = 0; nc < 4 && c + nc < N; nc++)
                        C[(r+nr)*N+c+nc] += A[(r+nr)*N+k] * B[k*N+c+nc];
                }
            }
        }
    }
    gettimeofday(&end, NULL);
    time = end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), time);
}
(5) 矩阵一维行列分块 A块 4N,B块 N4,并行,K在外层。注意:collapse(5) 会把 k 循环也纳入并行划分,不同 k 的迭代会累加同一个 C 元素,产生数据竞争,结果不正确;collapse 只应覆盖互相独立的外层 (r, c) 循环。
// (5) Parallel blocked multiply (4xN / Nx4 tiles), k outermost inside each
// tile.  Accumulates into C, which main() zero-initialized via calloc.
// BUG FIX: the original used collapse(5), which also parallelized the k
// loop; distinct k iterations then executed "C[...] += ..." on the SAME
// cell — a data race that corrupts the result.  Only the independent
// (r, c) tile loops are collapsed now; k/nr/nc stay sequential per tile.
// r and c are collapsed loop-control variables, hence implicitly private.
void gemm_5(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start, end;
    float time;
    int r, c;

    gettimeofday(&start, NULL);
    omp_set_num_threads(NT);
    #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
    for (r = 0; r < N; r += 4)
    {
        for (c = 0; c < N; c += 4)
        {
            for (int k = 0; k < N; k++)
            {
                for (int nr = 0; nr < 4; nr++)
                {
                    for (int nc = 0; nc < 4; nc++)
                        C[(r+nr)*N+c+nc] += A[(r+nr)*N+k] * B[k*N+c+nc];
                }
            }
        }
    }
    gettimeofday(&end, NULL);
    time = end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), time);
}
(6) 矩阵一维行列分块 A块 4N,B块 N4,串行,K在内层。
// (6) Serial blocked multiply: 4xN strips of A against Nx4 strips of B,
// with the k loop innermost so each output cell is finished in one pass
// through a local accumulator before being stored.
void gemm_6(int* A,int* B,long* C,int N,int NT)
{
    struct timeval t0, t1;
    int r, c, k;

    gettimeofday(&t0, NULL);
    for (r = 0; r < N; r += 4)
    {
        for (c = 0; c < N; c += 4)
        {
            for (int nr = 0; nr < 4; nr++)
            {
                for (int nc = 0; nc < 4; nc++)
                {
                    long acc = 0;
                    for (k = 0; k < N; k++)
                        acc += A[(r+nr)*N+k] * B[k*N+c+nc];
                    C[(r+nr)*N+c+nc] = acc;
                }
            }
        }
    }
    gettimeofday(&t1, NULL);

    float elapsed = t1.tv_sec - t0.tv_sec + (t1.tv_usec - t0.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), elapsed);
}
(7) 矩阵一维行列分块 A块 4N,B块 N4,并行,K在内层,collapse(2 + 3)
// (7) Nested parallel blocked multiply (4xN / Nx4 tiles), k innermost.
// BUG FIX: the inner region originally used collapse(3), folding the k loop
// into the worksharing; threads assigned different k then executed
// "C[...] += ..." on the SAME cell — a data race.  The inner collapse now
// covers only the independent (nr, nc) cell loops, and each cell
// accumulates into a private local before the single store to C.
// NOTE(review): the inner "parallel for" is a nested region; it only adds
// threads if nested parallelism is enabled — confirm intent.
void gemm_7(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start, end;
    float time;
    int r, c;

    gettimeofday(&start, NULL);
    omp_set_num_threads(NT);
    #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
    for (r = 0; r < N; r += 4)
    {
        for (c = 0; c < N; c += 4)
        {
            #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
            for (int nr = 0; nr < 4; nr++)
            {
                for (int nc = 0; nc < 4; nc++)
                {
                    long acc = 0;
                    for (int k = 0; k < N; k++)
                        acc += A[(r+nr)*N+k] * B[k*N+c+nc];
                    C[(r+nr)*N+c+nc] = acc;
                }
            }
        }
    }
    gettimeofday(&end, NULL);
    time = end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n", __func__, N, NT, sum(C, N), time);
}
(8) 矩阵一维行列分块 A块 4N,B块 N4,并行,K在内层,collapse (2+2)+ reduction
// (8) Triple-nested parallel blocked multiply (4xN / Nx4 tiles), k innermost,
// with a reduction on the k loop.  Correctness holds at every level: the
// outer collapse(2) hands out disjoint 4x4 tiles, the middle collapse(2)
// hands out disjoint cells within a tile, and the innermost reduction gives
// each thread a private copy of sum.
// NOTE(review): the inner two "parallel for" pragmas open nested regions;
// with nesting disabled (the usual default) they run single-threaded, so
// this mostly measures construct overhead — confirm intent.
// private(r,c) on the outer pragma is redundant: collapsed loop-control
// variables are implicitly private.
void gemm_8(int* A,int* B,long* C,int N,int NT)
{
    struct timeval start,end;
    float time;
    int r,c,k;

    gettimeofday(&start,NULL);
    omp_set_num_threads(NT);
    #pragma omp parallel for collapse(2) private(r,c) schedule(guided) proc_bind(close)
    for(r=0;r<N;r+=4)
    {
        for(c=0;c<N;c+=4)
        {
            #pragma omp parallel for collapse(2) schedule(guided) proc_bind(close)
            for(int nr=0;nr<4;nr++)
            {
                for(int nc=0;nc<4;nc++)
                {
                    // Per-cell accumulator combined by the reduction clause.
                    long sum =0;
                    #pragma omp parallel for reduction(+:sum) schedule(guided) proc_bind(close)
                    for(int k=0;k<N;k++)
                        sum += A[(r+nr)*N+k] * B[k*N+c+nc];
                    C[(r+nr)*N+c+nc] = sum;
                }
            }
        }
    }
    gettimeofday(&end,NULL);
    time = end.tv_sec-start.tv_sec+(end.tv_usec-start.tv_usec)/1e6;
    printf("func %s N %d threads_num %d sum %ld useSec %f\n",__func__,N,NT,sum(C,N),time);
}
4. 测试数据
串行耗时(秒) |
FIELD2 |
FIELD3 |
FIELD4 |
FIELD5 |
函数名 |
线程数\维度 |
128 |
512 |
1024 |
gemm_1 矩阵不分块 串行 |
1 |
0.01786 |
1.52600 |
47.10980 |
gemm_4 矩阵一维行列分块 A块 4N,B块 N4,串行,K在外层 |
1 |
0.03303 |
2.13628 |
22.47993 |
gemm_6 矩阵一维行列分块 A块 4N,B块 N4,串行,K在内层 |
1 |
0.02078 |
1.64609 |
65.69423 |
并行耗时(秒) |
|
|
|
|
函数名 |
线程数\维度 |
128 |
512 |
1024 |
gemm_2 矩阵不分块 并行 collapse(2) |
4 |
0.00515 |
0.36207 |
13.16278 |
|
8 |
0.00303 |
0.18024 |
6.34872 |
|
16 |
0.00382 |
0.12891 |
3.39717 |
|
32 |
0.00471 |
0.10166 |
2.02269 |
gemm_3 矩阵不分块 并行 collapse(2) + reduction |
4 |
0.01050 |
0.54038 |
11.69852 |
|
8 |
0.00593 |
0.23172 |
6.51897 |
|
16 |
0.00807 |
0.16333 |
3.60325 |
|
32 |
0.01048 |
0.12297 |
2.12352 |
gemm_7 矩阵一维行列分块 A块 4N,B块 N4,并行,K在内层,collapse(2+3) |
4 |
0.00893 |
0.71235 |
18.41659 |
|
8 |
0.00509 |
0.33909 |
8.78864 |
|
16 |
0.00803 |
0.23421 |
4.56388 |
|
32 |
0.01073 |
0.15445 |
2.65277 |
gemm_8 矩阵一维行列分块 A块 4N,B块 N4,并行,K在内层,collapse(2+2) + reduction |
4 |
0.01120 |
0.58372 |
16.70111 |
|
8 |
0.00622 |
0.26817 |
7.68263 |
|
16 |
0.00756 |
0.14903 |
4.24847 |
|
32 |
0.01052 |
0.13294 |
2.71985 |
5. 结果分析
(1). 关于线程数:较小的数据规模应用较低的线程数,否则线程创建销毁影响效率。较大的数据规模应采用更大的线程数量。总之数据规模应该与线程数量匹配才能效率较大。
(2). 关于reduction:效率没有开启向量化优化 O1及以上有效(测试了1024维度4/32线程加速明显),在问题规模较小(512维度及以下)即使没有采用向量化优化也比 reduction 效率高。
(3). 关于 collapse 和 reduction:测试了 collapse(2+3) 和 collapse(2+2) + reduction,在数据规模较小及线程数少(128维度和8线程及以下)全collapse效率高,除此之外随着数据规模和线程数增加,collapse 的任务规模数量级增加,效率不如 reduction。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?