高性能计算-优化-循环展开性能测试(4)

  1. 测试目标:测试for循环展开优化的提升效率
#include <cstdio>
#include <ctime>
#include <cstdlib>
/*
测试一维和二维循环展开效率区别
*/
#include <cstdio>
#include <ctime>
#include <cstdlib>
/*
测试一维和二维循环展开效率区别
*/
/*
 * Benchmark: sums the same 2000 x 2000 set of doubles four different ways and
 * prints each sum together with the CPU time measured by clock():
 *   1. nested loops over a 2-D array,
 *   2. a single flat loop over a 1-D array,
 *   3. a single flat loop over the 1-D array, manually unrolled 4x,
 *   4. nested loops over the 1-D array using a computed flat index.
 * Returns 0.
 */
int main()
{
    // Edge length of the square matrix and the total number of elements.
    constexpr int kDim = 2000;
    constexpr int kTotal = kDim * kDim;
    // The 4x unrolled loop below reads i..i+3 and has no remainder handling.
    static_assert(kTotal % 4 == 0, "element count must be a multiple of the unroll factor");

    // Static storage duration: each array is kTotal * sizeof(double) bytes
    // (about 32 MB), which as automatic locals would overflow a typical
    // default-sized stack. Static objects are zero-initialized.
    static double arr1[kDim][kDim];
    static double arr2[kTotal];

    // Fill both arrays with identical pseudo-random values in [0, 6].
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
        {
            arr1[i][j] = static_cast<float>(rand()) / RAND_MAX * 6.0f;
            arr2[kDim * i + j] = arr1[i][j];
        }
    }

    double sum1 = 0.0;
    double sum2 = 0.0;
    double sum3 = 0.0;
    double sum4 = 0.0;

    // Case 1: nested loops over the 2-D array.
    const clock_t start1 = clock();
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
            sum1 += arr1[i][j];
    }
    const clock_t end1 = clock();

    // Case 2: single flat loop over the 1-D array.
    const clock_t start2 = clock();
    for (int i = 0; i < kTotal; i++)
    {
        sum2 += arr2[i];
    }
    const clock_t end2 = clock();

    // Case 3: single flat loop, manually unrolled by a factor of 4.
    // The additions stay sequential on one accumulator, so the summation
    // order is the same as in case 2.
    const clock_t start3 = clock();
    for (int i = 0; i < kTotal; i += 4)
    {
        sum3 += arr2[i];
        sum3 += arr2[i + 1];
        sum3 += arr2[i + 2];
        sum3 += arr2[i + 3];
    }
    const clock_t end3 = clock();

    // Case 4: nested loops over the 1-D array with a computed flat index.
    const clock_t start4 = clock();
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
            sum4 += arr2[i * kDim + j];
    }
    const clock_t end4 = clock();

    // Report each sum and the elapsed CPU time in seconds.
    printf("2 dim arr sum1:%lf, cost:%lf s\n", sum1, static_cast<double>(end1 - start1) / CLOCKS_PER_SEC);
    printf("1 dim arr sum2:%lf, cost:%lf s\n", sum2, static_cast<double>(end2 - start2) / CLOCKS_PER_SEC);
    printf("1 dim arr sum3:%lf, 4x cost:%lf s\n", sum3, static_cast<double>(end3 - start3) / CLOCKS_PER_SEC);
    printf("1 dim arr sum4:%lf, double loop cost:%lf s\n", sum4, static_cast<double>(end4 - start4) / CLOCKS_PER_SEC);
    return 0;
}
  2. 测试数据
//4x O0优化
2 dim arr sum1:11999297.167960, cost:0.021406 s
1 dim arr sum2:11999297.167960, cost:0.020094 s
1 dim arr sum3:11999297.167960, 4x cost:0.022917 s
2 dim arr sum1:11999297.167960, cost:0.022119 s
1 dim arr sum2:11999297.167960, cost:0.021239 s
1 dim arr sum3:11999297.167960, 4x cost:0.023906 s
2 dim arr sum1:11999297.167960, cost:0.023547 s
1 dim arr sum2:11999297.167960, cost:0.021229 s
1 dim arr sum3:11999297.167960, 4x cost:0.021825 s
2 dim arr sum1:11999297.167960, cost:0.023784 s
1 dim arr sum2:11999297.167960, cost:0.021436 s
1 dim arr sum3:11999297.167960, 4x cost:0.021964 s
//2x O0优化
2 dim arr sum1:11999297.167960, cost:0.021334 s
1 dim arr sum2:11999297.167960, cost:0.019663 s
1 dim arr sum3:11999297.167960, 2x cost:0.021414 s
2 dim arr sum1:11999297.167960, cost:0.021309 s
1 dim arr sum2:11999297.167960, cost:0.019634 s
1 dim arr sum3:11999297.167960, 2x cost:0.021390 s
2 dim arr sum1:11999297.167960, cost:0.022325 s
1 dim arr sum2:11999297.167960, cost:0.021339 s
1 dim arr sum3:11999297.167960, 2x cost:0.021407 s
//4x O3优化
2 dim arr sum1:11999297.167960, cost:0.009233 s
1 dim arr sum2:11999297.167960, cost:0.008192 s
1 dim arr sum3:11999297.167960, 4x cost:0.008167 s
2 dim arr sum1:11999297.167960, cost:0.009286 s
1 dim arr sum2:11999297.167960, cost:0.008169 s
1 dim arr sum3:11999297.167960, 4x cost:0.008215 s
2 dim arr sum1:11999297.167960, cost:0.009352 s
1 dim arr sum2:11999297.167960, cost:0.008152 s
1 dim arr sum3:11999297.167960, 4x cost:0.008134 s
//4x Ofast优化
2 dim arr sum1:11999297.167960, cost:0.007150 s
1 dim arr sum2:11999297.167960, cost:0.005946 s
1 dim arr sum3:11999297.167960, 2x cost:0.005869 s
2 dim arr sum1:11999297.167960, cost:0.006816 s
1 dim arr sum2:11999297.167960, cost:0.005367 s
1 dim arr sum3:11999297.167960, 4x cost:0.005357 s
2 dim arr sum1:11999297.167960, cost:0.007004 s
1 dim arr sum2:11999297.167960, cost:0.005852 s
1 dim arr sum3:11999297.167960, 4x cost:0.005808 s
//4x O0 双层一维数组测试
2 dim arr sum1:11999297.167960, cost:0.022291 s
1 dim arr sum2:11999297.167960, cost:0.021354 s
1 dim arr sum3:11999297.167960, 4x cost:0.022473 s
1 dim arr sum4:11999297.167960, double loop cost:0.020899 s
2 dim arr sum1:11999297.167960, cost:0.022396 s
1 dim arr sum2:11999297.167960, cost:0.021465 s
1 dim arr sum3:11999297.167960, 4x cost:0.022987 s
1 dim arr sum4:11999297.167960, double loop cost:0.022999 s
2 dim arr sum1:11999297.167960, cost:0.021562 s
1 dim arr sum2:11999297.167960, cost:0.019639 s
1 dim arr sum3:11999297.167960, 4x cost:0.021876 s
1 dim arr sum4:11999297.167960, double loop cost:0.021314 s
//4x O3 双层一维数组测试
2 dim arr sum1:11999297.167960, cost:0.009570 s
1 dim arr sum2:11999297.167960, cost:0.008306 s
1 dim arr sum3:11999297.167960, 4x cost:0.008309 s
1 dim arr sum4:11999297.167960, double loop cost:0.008273 s
2 dim arr sum1:11999297.167960, cost:0.009535 s
1 dim arr sum2:11999297.167960, cost:0.008302 s
1 dim arr sum3:11999297.167960, 4x cost:0.008289 s
1 dim arr sum4:11999297.167960, double loop cost:0.008264 s
2 dim arr sum1:11999297.167960, cost:0.009337 s
1 dim arr sum2:11999297.167960, cost:0.008195 s
1 dim arr sum3:11999297.167960, 4x cost:0.008210 s
1 dim arr sum4:11999297.167960, double loop cost:0.008193 s
  3. 测试结论
    (1)循环展开中for循环条件转移指令的减少对性能提升并不明显,更大的意义在于提高循环体内指令的执行效率,比如向量化优化。向量化优化可以通过编译器自动优化参数 O2、O3、Ofast实现,也可以通过代码级循环展开实现(仍需要编译器优化参数),代码级提升约 0.2-0.3%。
    (2)双层循环与单层循环处理相同的数据,循环体执行次数相同,但双层循环增加了循环条件分支指令的数量:无编译优化时双层循环性能下降约0.6-0.8%,O3优化下双层循环提升约 0.4-0.6%。
posted @   安洛8  阅读(30)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
点击右上角即可分享
微信分享提示