编译器优化性能初步比较
OS:Windows XP 32 bit
CPU:Intel Mobile Core 2 Duo T6600
一、混合四则运算
main.c
#include <stdio.h>
#include <time.h>

/*
 * Mixed arithmetic micro-benchmark.
 *
 * Runs a fixed chain of int, float and double add / subtract / multiply /
 * divide operations 1000 * 1000000 times and measures the elapsed clock()
 * ticks around the loops.
 *
 * Output: the final values of e and f on one line, followed by the tick
 * count printed with "%10e".  The tick count is NOT converted to seconds;
 * divide it by CLOCKS_PER_SEC to obtain seconds.
 */
int main(void)
{
    int i, j, a = 1, b = 1;
    float c = 1.0, d = 1.0;
    double e = 1.0, f = 1.0;
    double start, finish, duration;

    start = clock();
    for (i = 0; i < 1000; i++) {
        for (j = 0; j < 1000000; j++) {
            /*
             * `a` is multiplied by 20 on every pass, so it leaves the range
             * of int after only a few iterations.  Signed overflow is
             * undefined behaviour, which would allow a compiler to
             * transform the benchmark arbitrarily.  The integer steps are
             * therefore done in unsigned arithmetic, which wraps by
             * definition; converting the result back to int is
             * implementation-defined rather than undefined.
             */
            a = (int)((unsigned int)a + 50u);
            b = (int)((unsigned int)a - 100u);
            a = (int)((unsigned int)b * 20u);

            /* float steps: the double constants force double -> float
             * conversions on every assignment. */
            c = a + 300.89;
            d = c - 600.89;
            c = d * 90.89;
            d = c / 55.89;

            /* double steps */
            e = c * 90.89;
            f = e / 55.89;
        }
    }
    finish = clock();
    duration = finish - start;

    printf("%f,%f\n", e, f);
    printf("%10e", duration);
    return 0;
}
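| 编译器 | O1 | O2 | O3(Ox) | 优化集合(无快速浮点优化) | 优化集合 |
| --- | --- | --- | --- | --- | --- |
| VS2008 C/C++ Compiler | 10.015 | 9.530 | 9.530 | 2.734 | 1.968 |
| gcc4.4.4 | 10.250 | 10.250 | 10.265 | 7.203 | 5.328 |
| gcc4.5.1 | 10.390 | 10.375 | 10.969 | 6.156 | 4.265 |
| Intel C/C++ Compiler 11.1 | 9.375 | 9.343 | 9.343 | 9.015 | 8.843 |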
优化集合为
VS2008 C/C++ Compiler:/Ox /Ob2 /Og /Oi /Ot /Oy /fp:fast /arch:SSE2
gcc4.4.4、gcc4.5.1:-O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3
Intel C/C++ Compiler 11.1:/fast /O3 /Ot /Og /Oi /Qipo /QxHost /arch:SSE3 /Qunroll /Qvec /Quse-intel-optimized-headers /Qparallel /fp:fast=2 /Ob2 /GT /GA
二、三角函数
main.c(来源于Intel官方)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

/*
 * Integrand |sin(x)|.
 * fabs() is required: abs() takes an int, so abs(sin(x)) would truncate the
 * sine value to an integer before taking the absolute value and the
 * integral would come out wrong.
 */
#define INTEG_FUNC(x) fabs(sin(x))

/*
 * Trigonometric benchmark: integrates INTEG_FUNC over [0, 2*pi] with the
 * trapezoid rule, using N = 2^j sub-intervals for j = 2 .. 26, and prints
 * one table row (N, computed integral) per resolution.  The exact value of
 * the integral is 4.
 *
 * The elapsed clock() ticks for the whole run (including the printing) are
 * reported at the end; divide by CLOCKS_PER_SEC to obtain seconds.
 */
int main(void)
{
    unsigned int i, j, N;
    double step, x_i, sum;
    double start, finish, duration;
    double interval_begin = 0.0;
    double interval_end = 2.0 * 3.141592653589793238;

    start = clock();

    printf(" \n");
    printf(" Number of | Computed Integral | \n");
    printf(" Interior Points | | \n");

    for (j = 2; j < 27; j++) {
        printf("------------------------------------- \n");

        N = 1u << j;
        step = (interval_end - interval_begin) / N;

        /* The two end points carry half weight in the trapezoid rule. */
        sum = INTEG_FUNC(interval_begin) * step / 2.0;
        for (i = 1; i < N; i++) {
            x_i = interval_begin + i * step;
            sum += INTEG_FUNC(x_i) * step;
        }
        sum += INTEG_FUNC(interval_end) * step / 2.0;

        /* N is unsigned, so it is printed with %u rather than %d. */
        printf(" %10u | %14e | \n", N, sum);
    }

    finish = clock();
    duration = (finish - start);

    printf(" \n");
    printf(" Application Clocks = %10e \n", duration);
    printf(" \n");
    return 0;
}
/* 耗时比较(单位:秒) -- article caption for the results table that follows:
 * "Timing comparison (unit: seconds)". */
| 编译器 | O1 | O2 | O3(Ox) | 优化集合(无快速浮点优化) | 优化集合 |
| --- | --- | --- | --- | --- | --- |
| VS2008 C/C++ Compiler | 9.687 | 9.343 | 8.734 | 8.281 | 6.843 |
| gcc4.4.4 | 20.219 | 20.296 | 20.593 | 15.062 | 15.046 |
| gcc4.5.1 | 20.125 | 19.953 | 20.094 | 15.000 | 15.187 |
| Intel C/C++ Compiler 11.1 | 6.640 | 4.828 | 4.828 | 4.812 | 4.812 |
优化集合同上
三、OpenMP测试
prime.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

/*
 * OpenMP benchmark: counts the primes in [start, end] by trial division,
 * and separately counts those of the form 4n+1 and 4n+3 (printed as 4n-1).
 *
 * Only odd candidates are visited (start is odd and the loop steps by 2),
 * each tested against the odd divisors 3, 5, ... up to floor(sqrt(i)) + 1.
 * The three counters are combined across threads by the OpenMP reduction
 * clause.  Without OpenMP enabled the pragma is ignored and the loop runs
 * serially.
 *
 * Prints the elapsed clock() ticks for the loop, then the three counts.
 */
int main(int argc, char *argv[])
{
    int i;
    int start, end;
    int number_of_primes = 0;
    int number_of_41primes = 0;
    int number_of_43primes = 0;
    double s1, s2;

    (void)argc;   /* command-line arguments are not used */
    (void)argv;

    start = 1;
    end = 40000000;
    printf("Range to check for Primes: %d - %d\n\n", start, end);

    s1 = clock();
#pragma omp parallel for schedule(dynamic,100) reduction(+:number_of_primes,number_of_41primes,number_of_43primes)
    for (i = start; i <= end; i += 2) {
        int limit, j, prime;

        limit = (int) sqrt((float)i) + 1;
        /* 1 is not a prime; without this guard it would pass the divisor
         * loop untested and be counted (also as a "4n+1 prime"). */
        prime = (i > 1);
        j = 3;
        while (prime && (j <= limit)) {
            if (i % j == 0)
                prime = 0;
            j += 2;
        }

        if (prime) {
            number_of_primes++;
            if (i % 4 == 1)
                number_of_41primes++;
            if (i % 4 == 3)
                number_of_43primes++;
        }
    }
    s2 = clock();

    /* The loop only visits odd numbers, so the single even prime is added
     * here (outside the timed region).  2 is neither 4n+1 nor 4n+3. */
    if (start <= 2 && 2 <= end)
        number_of_primes++;

    printf("\n%10e\n", s2 - s1);
    printf("\nProgram Done.\n %d primes found\n", number_of_primes);
    printf("\nNumber of 4n+1 primes found: %d\n", number_of_41primes);
    printf("\nNumber of 4n-1 primes found: %d\n", number_of_43primes);
    return 0;
}
/* 采用优化集合+OpenMP参数 -- article text continuing after the listing:
 * "built with the optimization set plus the OpenMP option". */
其中,VS2008为/openmp,gcc为-fopenmp,intel compiler为/Qopenmp。
| 编译器 | 耗时 |
| --- | --- |
| VS2008 C/C++ Compiler | 16.781 |
| gcc4.4.4 | 16.828 |
| gcc4.5.1 | 15.672 |
| Intel C/C++ Compiler 11.1 | 16.703 |
四、Fortran Compiler测试
Fortran编译器和以上的结果类似,除了VS2008(不支持Fortran),
gfortran在普通计算上和intel compiler相差很少,
只是在三角函数运算上落后较多。
linpk标准测试
代码来源:http://www.polyhedron.com/compare0html
| 编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
| --- | --- | --- | --- | --- | --- |
| gfortran4.4.4 | 25.109 | 24.938 | 25.172 | 24.846 | 24.922 |
| gfortran4.5.1 | 24.375 | 24.313 | 24.203 | 24.063 | 24.234 |
| Intel Fortran Compiler 11.1 | 25.813 | 25.188 | 25.016 | 25.484 | 25.203 |
矩阵相乘测试(内置函数)
main.f90
! Matrix-multiplication benchmark using the intrinsic MATMUL.
! Fills two n x n real(kind = 8) matrices with random numbers, multiplies
! them, and prints the CPU time spent in the multiplication only.
program main
  implicit none

  integer, parameter :: n = 2000
  real(kind = 8), dimension(n, n) :: A, B, C
  real(kind = 8) :: t_start, t_stop

  ! Random input data; generating it is not part of the timed region.
  call random_seed()
  call random_number(A)
  call random_number(B)

  call cpu_time(t_start)
  C = matmul(A, B)
  call cpu_time(t_stop)

  write(*,*) "consumed CPU_time(s):", t_stop - t_start
end program main
| 编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
| --- | --- | --- | --- | --- | --- |
| gfortran4.4.4 | 15.500 | 15.563 | 15.688 | 15.656 | 15.469 |
| Intel Fortran Compiler 11.1 | 37.734 | 37.359 | 4.484 | 5.047 | 4.953 |
矩阵相乘测试(调用原始blas)
blas代码来源:http://www.netlib.org/lapack/
main.f90
! Matrix-multiplication benchmark calling the BLAS routine dgemm.
! Fills two n x n real(kind = 8) matrices with random numbers, computes
! C = 1.0 * A * B + 0.0 * C with dgemm (no transposition), and prints the
! CPU time spent in the dgemm call only.
program main
  implicit none

  integer, parameter :: n = 2000
  real(kind = 8), dimension(n, n) :: A, B, C
  real(kind = 8) :: t_start, t_stop

  ! Random input data; generating it is not part of the timed region.
  call random_seed()
  call random_number(A)
  call random_number(B)

  call cpu_time(t_start)
  call dgemm('N', 'N', n, n, n, 1.0_8, A, n, B, n, 0.0_8, C, n)
  call cpu_time(t_stop)

  write(*,*) "consumed CPU_time(s):", t_stop - t_start
end program main
| 编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
| --- | --- | --- | --- | --- | --- |
| gfortran4.4.4 | 18.500 | 17.844 | 17.391 | 17.016 | 17.156 |
| Intel Fortran Compiler 11.1 | 14.938 | 13.969 | 13.938 | 18.227 | 18.430 |
五、结论
Intel Compiler在测试中表现良好,尤其对内置函数进行了比较多的优化,VS2008亦表现不错,
gcc除了在三角函数计算里远远落后外,其他的性能表现也还是不错的,考虑到gcc的开源跨平台,因此
占有比Intel Compiler和M$ Compiler更重要的位置。
作者:PcX
出处:http://www.cnblogs.com/xunxun1982/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。