高性能计算-gemv-向量化优化(16)

1. 目标:矩阵向量乘法 y = A * x (列向量 = 矩阵 *列向量),进行串行,循环展开+simd, simd+omp的效率对比。

2. 源码

#include <iostream>
#include <ctime>
#include <arm_neon.h>
#include <omp.h>
using namespace std;
void dgemv(const int n, const int m, const double *const A,
const double *const x, double *y)
{
for (int i = 0; i < n; i++)
{
double y_tmp = 0.0;
for (int j = 0; j < m; j++)
{
y_tmp += A[i * m + j] * x[j];
}
y[i] = y_tmp;
}
return;
}
//循环展开 + neon
void dgemv_neon(const int n, const int m, const double *const A,
const double *const x, double *y)
{
for (int i = 0; i < n; i+=2)
{
float64x2_t vsum1 = {0.0};
float64x2_t vsum2 = {0.0};
for (int j = 0; j < m; j+=2)
{
float64x2_t vX = vld1q_f64(x+j);
float64x2_t vA1 = vld1q_f64(A+i*m+j);
float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j);
vsum1 = vfmaq_f64(vsum1,vA1,vX);
vsum2 = vfmaq_f64(vsum2,vA2,vX);
}
y[i] = vaddvq_f64(vsum1);
y[i+1] = vaddvq_f64(vsum2);
}
return;
}
//循环展开 + neon + omp
void dgemv_neon_omp(const int n, const int m, const double *const A,
const double *const x, double *y)
{
omp_set_num_threads(4);
#pragma omp paralel for proc_bind(close)
for (int i = 0; i < n; i+=2)
{
float64x2_t vsum1 = {0.0};
float64x2_t vsum2 = {0.0};
for (int j = 0; j < m; j+=2)
{
float64x2_t vX = vld1q_f64(x+j);
float64x2_t vA1 = vld1q_f64(A+i*m+j);
float64x2_t vA2 = vld1q_f64(A+(i+1)*m+j);
vsum1 = vfmaq_f64(vsum1,vA1,vX);
vsum2 = vfmaq_f64(vsum2,vA2,vX);
}
y[i] = vaddvq_f64(vsum1);
y[i+1] = vaddvq_f64(vsum2);
#pragma omp barrier
}
return;
}
void printM(double* A,int n,int m)
{
for(int i=0;i<n;i++)
{
for(int j=0;j<m;j++)
printf("%.2f ",A[i*m+j]);
printf("\n");
}
}
int main(int argc,char** argv)
{
if(argc!=2)
{
printf("should parameter 0:serial 1:neon 2:neon+omp");
return 0;
}
int mode = atoi(argv[1]);
int nloop = 20;
int n = 10240;
double *mat_A = new double[n * n];
double *vec_x = new double[n];
double *vec_y = new double[n];
srand(0);
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
vec_x[j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
mat_A[i * n + j] = (double)(rand() % 10) + (double)rand() / RAND_MAX * 3.14;
}
vec_y[i] = 0.0;
}
clock_t t = clock();
for (int iloop = 0; iloop < nloop; iloop++)
{
switch (mode)
{
case 0:
dgemv(n, n, mat_A, vec_x, vec_y);
break;
case 1:
dgemv_neon(n, n, mat_A, vec_x, vec_y);
break;
case 2:
dgemv_neon_omp(n, n, mat_A, vec_x, vec_y);
break;
default:
break;
}
}
t = clock() - t;
// printM(mat_A,n,n);
cout << "y[0]:" << vec_y[0] << endl;
cout << "y[1]:" << vec_y[1] << endl;
cout << "y[2]:" << vec_y[2] << endl;
cout << "y[3]:" << vec_y[3] << endl;
cout << "y[4]:" << vec_y[4] << endl;
cout << "gemv mode "<<mode <<" ave cost time(clock):" << t / nloop << endl;
}

3. 数据

g++ gemv.cpp -o gemv -fopenmp
g++ gemv.cpp -o gemv -fopenmp -O1

耗时为clock_t时间单位

函数\编译优化设置 O0无优化 O1无附加参数优化
串行 721297 235374
循环展开+SIMD 759981 173499
循环展开+SIMD+OMP 690216 170610

4. 结果分析

(1). O0优化耗时差别不大

(2). O1优化下,循环展开+SIMD 加速比为 1.35

(3). neon经测试效率没有明显提升

posted @   安洛8  阅读(70)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
点击右上角即可分享
微信分享提示