1. 目标:矩阵向量乘法 y = A * x (列向量 = 矩阵 *列向量),进行串行,循环展开+simd, simd+omp的效率对比。
2. 源码
| #include <iostream> |
| #include <ctime> |
| #include <arm_neon.h> |
| #include <omp.h> |
| using namespace std; |
| |
| |
| void dgemv(const int n, const int m, const double *const A, |
| const double *const x, double *y) |
| { |
| for (int i = 0; i < n; i++) |
| { |
| double y_tmp = 0.0; |
| for (int j = 0; j < m; j++) |
| { |
| y_tmp += A[i * m + j] * x[j]; |
| } |
| y[i] = y_tmp; |
| } |
| return; |
| } |
| |
| |
| void dgemv_neon(const int n, const int m, const double *const A, |
| const double *const x, double *y) |
| { |
| for (int i = 0; i < n; i+=2) |
| { |
| float64x2_t vsum1 = {0.0}; |
| float64x2_t vsum2 = {0.0}; |
| for (int j = 0; j < m; j+=2) |
| { |
| float64x2_t vX = vld1q_f64(x+j); |
| float64x2_t vA1 = vld1q_f64(A+i*m+j); |
| float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j); |
| vsum1 = vfmaq_f64(vsum1,vA1,vX); |
| vsum2 = vfmaq_f64(vsum2,vA2,vX); |
| } |
| y[i] = vaddvq_f64(vsum1); |
| y[i+1] = vaddvq_f64(vsum2); |
| } |
| return; |
| } |
| |
| |
| void dgemv_neon_omp(const int n, const int m, const double *const A, |
| const double *const x, double *y) |
| { |
| omp_set_num_threads(4); |
| #pragma omp paralel for proc_bind(close) |
| for (int i = 0; i < n; i+=2) |
| { |
| float64x2_t vsum1 = {0.0}; |
| float64x2_t vsum2 = {0.0}; |
| for (int j = 0; j < m; j+=2) |
| { |
| float64x2_t vX = vld1q_f64(x+j); |
| float64x2_t vA1 = vld1q_f64(A+i*m+j); |
| float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j); |
| vsum1 = vfmaq_f64(vsum1,vA1,vX); |
| vsum2 = vfmaq_f64(vsum2,vA2,vX); |
| } |
| y[i] = vaddvq_f64(vsum1); |
| y[i+1] = vaddvq_f64(vsum2); |
| #pragma omp barrier |
| } |
| return; |
| } |
| |
| void printM(double* A,int n,int m) |
| { |
| for(int i=0;i<n;i++) |
| { |
| for(int j=0;j<m;j++) |
| printf("%.2f ",A[i*m+j]); |
| printf("\n"); |
| } |
| } |
| |
| int main(int argc,char** argv) |
| { |
| if(argc!=2) |
| { |
| printf("should parameter 0:serial 1:neon 2:neon+omp"); |
| return 0; |
| } |
| int mode = atoi(argv[1]); |
| int nloop = 20; |
| int n = 10240; |
| double *mat_A = new double[n * n]; |
| double *vec_x = new double[n]; |
| double *vec_y = new double[n]; |
| |
| srand(0); |
| for (int i = 0; i < n; i++) |
| { |
| for (int j = 0; j < n; j++) |
| { |
| vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14; |
| mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14; |
| } |
| vec_y[i] = 0.0; |
| } |
| |
| clock_t t = clock(); |
| for (int iloop = 0; iloop < nloop; iloop++) |
| { |
| switch (mode) |
| { |
| case 0: |
| dgemv(n, n, mat_A, vec_x, vec_y); |
| break; |
| case 1: |
| dgemv_neon(n, n, mat_A, vec_x, vec_y); |
| break; |
| case 2: |
| dgemv_neon_omp(n, n, mat_A, vec_x, vec_y); |
| break; |
| default: |
| break; |
| } |
| } |
| t = clock() - t; |
| |
| |
| cout << "y[0]:" << vec_y[0] << endl; |
| cout << "y[1]:" << vec_y[1] << endl; |
| cout << "y[2]:" << vec_y[2] << endl; |
| cout << "y[3]:" << vec_y[3] << endl; |
| cout << "y[4]:" << vec_y[4] << endl; |
| |
| cout << "gemv mode "<<mode <<" ave cost time(clock):" << t / nloop << endl; |
| } |
3. 数据
| g++ gemv.cpp -o gemv -fopenmp |
| g++ gemv.cpp -o gemv -fopenmp -O1 |
耗时为clock_t时间单位
函数\编译优化设置 |
O0无优化 |
O1无附加参数优化 |
串行 |
721297 |
235374 |
循环展开+SIMD |
759981 |
173499 |
循环展开+SIMD+OMP |
690216 |
170610 |
4. 结果分析
(1). O0优化耗时差别不大
(2). O1优化下,循环展开+SIMD 加速比为 1.35
(3). neon经测试效率没有明显提升
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂