高性能计算:OpenMP多进程实现并行向量乘法
平台 win10 g++
g++和gcc中自带OpenMP的库。直接引入头文件 #include "omp.h",并使用编译参数 gcc -fopenmp 或 g++ -fopenmp 即可(两者都是 -fopenmp)。另外加入了 -O2 优化(注意是大写 O;小写的 -o2 只是把输出文件命名为 2)。
主要对比验证了直接做矩阵相乘;先转置再相乘(提高缓存命中率),分块相乘,OpenMP并行,OpenMP分块并行
void serial_multiplication(); 最基础的串行实现,通过三重for循环
void serial_multiplication_trans(); 通过先转置矩阵B,再做三重for循环
void serial_multiplication_block(); 串行实现,加入块棋盘划分
void parallel_multiplication(); 使用OpenMP的并行实现
void parallel_multiplication_block(); 使用OpenMP的并行实现,加入的块棋盘分块
点击查看代码
// System headers: use angle brackets (these are not project-local files).
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <omp.h>
#include <random>
#include <iostream>
using namespace std;
using namespace std::chrono;
// Matrix dimensions: A is M x N, B is N x M, result is M x M.
#define M 4096
#define N 4096
// Thread count used by the OpenMP kernels.
#define ThreadNumber 2
// Tile edge for the blocked kernels.
// BUG FIX: was 2046, which does not divide 4096 -- the blocked loops then
// walk past the arrays (e.g. outer i = 4092 plus i2 up to SM-1 reaches
// row 6137 of a 4096-row matrix). SM must evenly divide M and N.
#define SM 2048
double matrix_a[M][N];
double matrix_b[N][M];
double result[M][M];
double temp[M][M]; // scratch buffer holding B transposed
// double result[M][M] __attribute__ ((aligned (64)));
void init_matrix();
void serial_multiplication();
void serial_multiplication_trans();
void serial_multiplication_block();
void parallel_multiplication();
void parallel_multiplication_block();
int main()
{
init_matrix();
auto start = system_clock::now();
serial_multiplication();
auto end = system_clock::now();
auto duration = duration_cast<microseconds>(end - start);
cout << "serial multiplication takes "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << endl;
init_matrix();
start = system_clock::now();
serial_multiplication_trans();
end = system_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "serial_multiplication_trans takes "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << endl;
init_matrix();
start = system_clock::now();
serial_multiplication_block();
end = system_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "serial_multiplication_block takes "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << endl;
init_matrix();
start = system_clock::now();
parallel_multiplication();
end = system_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "parallel multiplication takes takes "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << endl;
init_matrix();
start = system_clock::now();
parallel_multiplication_block();
end = system_clock::now();
duration = duration_cast<microseconds>(end - start);
cout << "parallel_multiplication_block takes "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << endl;
cout << "Finished";
return 0;
}
//generate the same matrix everytime
void init_matrix()
{
default_random_engine engine;
uniform_real_distribution<double> u(0.0, 1.0);
for (int i = 0; i < M; ++i)
{
for (int j = 0; j < N; ++j)
{
matrix_a[i][j] = u(engine);
result[i][j] = 0;
}
}
for (int i = 0; i < N; ++i)
{
for (int j = 0; j < M; ++j)
{
matrix_b[i][j] = u(engine);
}
}
}
// Baseline serial product: result += A * B via the classic i-j-k triple
// loop, no cache optimization. Each entry accumulates the dot product of
// A's row and B's column in a local, then stores it back once.
void serial_multiplication()
{
    for (int row = 0; row < M; ++row)
    {
        for (int col = 0; col < M; ++col)
        {
            // Seed from the current value so this matches `+=` exactly
            // (result is zeroed by init_matrix, but order is preserved).
            double acc = result[row][col];
            for (int idx = 0; idx < N; ++idx)
                acc += matrix_a[row][idx] * matrix_b[idx][col];
            result[row][col] = acc;
        }
    }
}
// A * B computed after explicitly transposing B into `temp`, so the
// innermost loop reads both operands row-wise (better cache locality).
void serial_multiplication_trans()
{
    // temp = B^T, i.e. temp[i][j] = B[j][i] (valid because M == N here).
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < M; j++)
        {
            temp[i][j] = matrix_b[j][i];
        }
    }
    for (int i = 0; i < M; ++i)
    {
        for (int j = 0; j < M; ++j)
        {
            for (int k = 0; k < N; ++k)
            {
                // BUG FIX: the original read matrix_b[j][k] here, which
                // computes A * B^T and makes the transpose above dead code.
                // temp[j][k] == matrix_b[k][j], giving the correct A * B
                // with sequential accesses on both operands.
                result[i][j] += matrix_a[i][k] * temp[j][k];
            }
        }
    }
}
// Serial blocked (tiled) multiplication: result += A * B processed in
// SM x SM tiles so the working set of each tile stays cache-resident.
// NOTE(review): correctness requires SM to evenly divide N; otherwise the
// pointer walks below run past the array bounds. The row stride N also
// assumes square matrices (M == N), which holds for the #defines above.
void serial_multiplication_block()
{
// Cursors into the current rows of the result / A / B tiles.
double *rres;
double *rmul1;
double *rmul2;
int i2, k2, j2, i, j, k;
// i, j, k select the tile; i2, k2, j2 walk inside the tile.
for (i = 0; i < N; i += SM)
for (j = 0; j < N; j += SM)
for (k = 0; k < N; k += SM)
// Advance one matrix row per i2 step (+= N jumps to the next row).
for (i2 = 0, rres = &result[i][j], rmul1 = &matrix_a[i][k];
i2 < SM;
++i2, rres += N, rmul1 += N)
// rmul2 starts at B's tile row k+k2; rmul1[k2] is A[i+i2][k+k2].
for (k2 = 0, rmul2 = &matrix_b[k][j];
k2 < SM; ++k2, rmul2 += N)
// Innermost loop streams one result row and one B row contiguously.
for (j2 = 0; j2 < SM; ++j2)
rres[j2] += rmul1[k2] * rmul2[j2];
}
// Blocked (tiled) multiplication parallelized with OpenMP: the outer
// tile-row loop is distributed across ThreadNumber threads. Distinct i
// tiles touch disjoint rows of `result`, so no synchronization is needed.
// BUG FIX: the original declared i2/k2/j2/rres/rmul1/rmul2 at function
// scope, making them shared by all threads (a data race on every counter
// and cursor), and opened a second nested `parallel for` inside the first
// (nested regions / thread oversubscription). All per-thread state is now
// declared inside the parallel loop body, so it is private by scoping.
void parallel_multiplication_block()
{
#pragma omp parallel for schedule(static) num_threads(ThreadNumber)
    for (int i = 0; i < N; i += SM)
    {
        for (int j = 0; j < N; j += SM)
        {
            for (int k = 0; k < N; k += SM)
            {
                // Multiply one SM x SM tile:
                // result[i..][j..] += A[i..][k..] * B[k..][j..]
                for (int i2 = 0; i2 < SM; ++i2)
                {
                    double *rres = &result[i + i2][j];
                    const double *rmul1 = &matrix_a[i + i2][k];
                    for (int k2 = 0; k2 < SM; ++k2)
                    {
                        const double *rmul2 = &matrix_b[k + k2][j];
                        for (int j2 = 0; j2 < SM; ++j2)
                            rres[j2] += rmul1[k2] * rmul2[j2];
                    }
                }
            }
        }
    }
}
// Naive multiplication parallelized with OpenMP: rows of the result are
// split across ThreadNumber*ThreadNumber threads. Each (row, col) entry is
// accumulated in a thread-local scalar and written back with one store, so
// threads never touch the same result element.
void parallel_multiplication()
{
#pragma omp parallel for num_threads(ThreadNumber*ThreadNumber)
    for (int row = 0; row < M; ++row)
    {
        for (int col = 0; col < M; ++col)
        {
            double dot = 0;
            for (int k = 0; k < N; ++k)
                dot += matrix_a[row][k] * matrix_b[k][col];
            result[row][col] = dot;
        }
    }
}
实验设置:问题规模 N=2046,使用 4 个线程。
分析:
根据实验结果来看,通过对B转置的方法,大大地提高了缓存命中率,从而显著地提高了运行速度。 在各种问题规模下,提高缓存命中率可以加快约一倍的速度。
在各种问题规模下,分块的方法要远远快于基本方法,但是分块方法仅仅比转置方法快一点点,原因是在分块也只是通过提高缓存命中率的方式加速矩阵乘法的速度。
另外,并行算法相较串行算法有一定优势,但加速比并没有达到与线程数成正比的程度,因为并行实现中存在线程创建、调度与同步等额外开销,以及内存带宽的限制。