- 测试目标:测试for循环展开优化的提升效率
| #include <cstdio> |
| #include <ctime> |
| #include <cstdlib> |
| |
| |
| |
| |
| #include <cstdio> |
| #include <ctime> |
| #include <cstdlib> |
| |
| |
| |
| |
// Micro-benchmark: sums the same 2000 x 2000 random doubles in four ways and
// prints each sum together with the CPU time reported by std::clock():
//   1. nested loops over a 2-D array,
//   2. a single loop over a flat 1-D array,
//   3. a single loop over the flat array, manually unrolled 4x,
//   4. nested loops indexing the flat array as i * kDim + j.
// Returns 0 after printing the four result lines.
int main()
{
    constexpr int kDim = 2000;
    constexpr int kTotal = kDim * kDim;
    // The 4x-unrolled loop reads arr2[i] .. arr2[i + 3]; this guarantees it
    // never reads past the end of the buffer if kDim is changed later.
    static_assert(kTotal % 4 == 0, "unrolled loop requires a multiple of 4 elements");

    // Static storage duration: the two buffers total about 64 MB, far more
    // than the default stack size on common platforms (typically 1-8 MB), so
    // declaring them as automatic variables overflows the stack at startup.
    // Objects with static storage duration are zero-initialised, matching the
    // original `= {0}` initialisers.
    static double arr1[kDim][kDim];
    static double arr2[kTotal];

    // Fill both layouts with identical pseudo-random values. The expression is
    // deliberately evaluated in float precision so the printed sums stay
    // comparable with previously recorded runs.
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
        {
            arr1[i][j] = static_cast<float>(std::rand()) / RAND_MAX * 6.0f;
            arr2[kDim * i + j] = arr1[i][j];
        }
    }

    double sum1 = 0.0;
    double sum2 = 0.0;
    double sum3 = 0.0;
    double sum4 = 0.0;

    // Case 1: nested loops over the 2-D array.
    const std::clock_t start1 = std::clock();
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
            sum1 += arr1[i][j];
    }
    const std::clock_t end1 = std::clock();

    // Case 2: single loop over the flat array.
    const std::clock_t start2 = std::clock();
    for (int i = 0; i < kTotal; i++)
    {
        sum2 += arr2[i];
    }
    const std::clock_t end2 = std::clock();

    // Case 3: single loop over the flat array, manually unrolled 4x.
    const std::clock_t start3 = std::clock();
    for (int i = 0; i < kTotal; i += 4)
    {
        sum3 += arr2[i];
        sum3 += arr2[i + 1];
        sum3 += arr2[i + 2];
        sum3 += arr2[i + 3];
    }
    const std::clock_t end3 = std::clock();

    // Case 4: nested loops over the flat array with a computed index.
    const std::clock_t start4 = std::clock();
    for (int i = 0; i < kDim; i++)
    {
        for (int j = 0; j < kDim; j++)
            sum4 += arr2[i * kDim + j];
    }
    const std::clock_t end4 = std::clock();

    // Printing the sums also keeps the summation loops observable, so an
    // optimising compiler cannot discard them as dead code.
    std::printf("2 dim arr sum1:%lf, cost:%lf s\n", sum1, static_cast<double>(end1 - start1) / CLOCKS_PER_SEC);
    std::printf("1 dim arr sum2:%lf, cost:%lf s\n", sum2, static_cast<double>(end2 - start2) / CLOCKS_PER_SEC);
    std::printf("1 dim arr sum3:%lf, 4x cost:%lf s\n", sum3, static_cast<double>(end3 - start3) / CLOCKS_PER_SEC);
    std::printf("1 dim arr sum4:%lf, double loop cost:%lf s\n", sum4, static_cast<double>(end4 - start4) / CLOCKS_PER_SEC);
    return 0;
}
- 测试数据
| |
| 2 dim arr sum1:11999297.167960, cost:0.021406 s |
| 1 dim arr sum2:11999297.167960, cost:0.020094 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.022917 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.022119 s |
| 1 dim arr sum2:11999297.167960, cost:0.021239 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.023906 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.023547 s |
| 1 dim arr sum2:11999297.167960, cost:0.021229 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.021825 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.023784 s |
| 1 dim arr sum2:11999297.167960, cost:0.021436 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.021964 s |
| |
| |
| 2 dim arr sum1:11999297.167960, cost:0.021334 s |
| 1 dim arr sum2:11999297.167960, cost:0.019663 s |
| 1 dim arr sum3:11999297.167960, 2x cost:0.021414 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.021309 s |
| 1 dim arr sum2:11999297.167960, cost:0.019634 s |
| 1 dim arr sum3:11999297.167960, 2x cost:0.021390 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.022325 s |
| 1 dim arr sum2:11999297.167960, cost:0.021339 s |
| 1 dim arr sum3:11999297.167960, 2x cost:0.021407 s |
| |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009233 s |
| 1 dim arr sum2:11999297.167960, cost:0.008192 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008167 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009286 s |
| 1 dim arr sum2:11999297.167960, cost:0.008169 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008215 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009352 s |
| 1 dim arr sum2:11999297.167960, cost:0.008152 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008134 s |
| |
| |
| 2 dim arr sum1:11999297.167960, cost:0.007150 s |
| 1 dim arr sum2:11999297.167960, cost:0.005946 s |
| 1 dim arr sum3:11999297.167960, 2x cost:0.005869 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.006816 s |
| 1 dim arr sum2:11999297.167960, cost:0.005367 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.005357 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.007004 s |
| 1 dim arr sum2:11999297.167960, cost:0.005852 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.005808 s |
| |
| |
| 2 dim arr sum1:11999297.167960, cost:0.022291 s |
| 1 dim arr sum2:11999297.167960, cost:0.021354 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.022473 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.020899 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.022396 s |
| 1 dim arr sum2:11999297.167960, cost:0.021465 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.022987 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.022999 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.021562 s |
| 1 dim arr sum2:11999297.167960, cost:0.019639 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.021876 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.021314 s |
| |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009570 s |
| 1 dim arr sum2:11999297.167960, cost:0.008306 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008309 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.008273 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009535 s |
| 1 dim arr sum2:11999297.167960, cost:0.008302 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008289 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.008264 s |
| |
| 2 dim arr sum1:11999297.167960, cost:0.009337 s |
| 1 dim arr sum2:11999297.167960, cost:0.008195 s |
| 1 dim arr sum3:11999297.167960, 4x cost:0.008210 s |
| 1 dim arr sum4:11999297.167960, double loop cost:0.008193 s |
| |
- 测试结论
(1)循环展开减少了 for 循环的条件转移指令,但这对性能的提升并不明显;其更大的意义在于提升循环体内指令的计算效率,例如便于向量化优化。向量化优化可以通过编译器优化参数(-O2、-O3、-Ofast)自动实现,也可以通过代码级循环展开实现(仍需配合编译器优化参数),代码级展开的提升约为 0.2-0.3%。
(2)双层循环与单层循环遍历相同的数据,循环体的执行次数也相同,但双层循环增加了循环条件分支指令的数量:无编译优化时,双层循环性能下降约 0.6-0.8%;-O3 优化下,双层循环提升约 0.4-0.6%。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂