编译选项:
CXXFLAGS += -mavx2
// AVX2 intrinsics are only usable when the translation unit is built with
// AVX2 enabled (e.g. -mavx2); without it the benchmark falls back to scalar
// code so the file still compiles.
#if defined(__AVX2__)
#include <immintrin.h>
#endif

#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>

/// Sets the first `vector_size` elements of both `x` and `y` to 1.
///
/// @param x            destination array, at least `vector_size` ints
/// @param y            destination array, at least `vector_size` ints
/// @param vector_size  number of elements to write; values <= 0 write nothing
void reset_xy(int *x, int *y, int vector_size) {
    for (int i = 0; i < vector_size; i++) {
        x[i] = 1;
        y[i] = 1;
    }
}

/// Calls `f()` `repeat_times` times and returns the elapsed processor time.
///
/// @param repeat_times  number of invocations; values <= 0 invoke nothing
/// @param f             callable taking no arguments
/// @return CPU time in seconds as measured by std::clock()
template <typename F>
double runtime_test(int repeat_times, const F& f) {
    const std::clock_t start = std::clock();
    // A bounded for-loop (instead of `while (counter--)`) cannot spin on a
    // negative repeat count.
    for (int i = 0; i < repeat_times; ++i) {
        f();
    }
    const std::clock_t end = std::clock();
    return static_cast<double>(end - start) / CLOCKS_PER_SEC;
}

// Scalar reference kernel: x[i] += y[i] for i in [0, vector_size).
static void add_arrays_scalar(int *x, const int *y, int vector_size) {
    for (int i = 0; i < vector_size; i++) {
        x[i] += y[i];
    }
}

// AVX2 kernel: same result as add_arrays_scalar, eight ints per iteration.
// Expects vector_size >= 0.
static void add_arrays_avx2(int *x, const int *y, int vector_size) {
    int i = 0;
#if defined(__AVX2__)
    // Step 8: 8 lanes * 32 bit = one 256-bit register. The bound is written
    // as `vector_size - 8` so a partial final group is never loaded or stored.
    for (; i <= vector_size - 8; i += 8) {
        __m256i vx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(x + i));
        const __m256i vy = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(y + i));
        vx = _mm256_add_epi32(vx, vy);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(x + i), vx);
    }
#endif
    // Scalar tail for sizes that are not a multiple of 8; this is also the
    // whole computation when AVX2 is not compiled in.
    for (; i < vector_size; i++) {
        x[i] += y[i];
    }
}

// Returns true when every element of `values` equals `expected`.
static bool all_elements_equal(const std::vector<int>& values, int expected) {
    for (const int v : values) {
        if (v != expected) {
            return false;
        }
    }
    return true;
}

/// Benchmarks element-wise `x[i] += y[i]` with a scalar loop and with AVX2.
///
/// Both arrays are reset to 1 before each run, so after `repeat_times`
/// additions every element of `x` must equal `1 + repeat_times`; both runs
/// are checked against that value. Timings and their ratio go to std::cout.
///
/// @param repeat_times  how many times each kernel is run (>= 0)
/// @param vector_size   number of ints in each array (>= 1)
/// @return 0 on success; 1 on invalid arguments or if either kernel produced
///         an unexpected element value
int avx_test(int repeat_times = 100, int vector_size = 1000 * 1000) {
    if (repeat_times < 0 || vector_size < 1) {
        std::cerr << "avx_test: repeat_times must be >= 0 and vector_size >= 1\n";
        return 1;
    }
#if !defined(__AVX2__)
    std::cerr << "avx_test: built without AVX2 (-mavx2); the avx run uses the scalar fallback\n";
#endif

    // Heap storage: two 1M-int automatic arrays (~8 MB) can exhaust the stack.
    const auto count = static_cast<std::vector<int>::size_type>(vector_size);
    std::vector<int> x_storage(count);
    std::vector<int> y_storage(count);
    int *x = x_storage.data();
    int *y = y_storage.data();
    const int expected = 1 + repeat_times;

    reset_xy(x, y, vector_size);
    std::cout << "repeat_times = " << repeat_times
              << " vector_size = " << vector_size << '\n';

    auto f_noavx = [x, y, vector_size]() { add_arrays_scalar(x, y, vector_size); };
    const double noavx_time = runtime_test(repeat_times, f_noavx);
    std::cout << "x[0] = " << x[0]
              << " x[vector_size - 1] = " << x[vector_size - 1] << '\n';
    const bool noavx_ok = all_elements_equal(x_storage, expected);

    auto f_avx = [x, y, vector_size]() { add_arrays_avx2(x, y, vector_size); };
    reset_xy(x, y, vector_size);
    const double avx_time = runtime_test(repeat_times, f_avx);
    std::cout << "x[0] = " << x[0]
              << " x[vector_size - 1] = " << x[vector_size - 1] << '\n';
    const bool avx_ok = all_elements_equal(x_storage, expected);

    std::cout << "noavx_time = " << noavx_time
              << " avx_time = " << avx_time << '\n';
    std::cout << "noavx_time / avx_time = ";
    if (avx_time > 0.0) {
        std::cout << noavx_time / avx_time << '\n';
    } else {
        // clock() resolution can report 0 for very short runs; avoid inf/nan.
        std::cout << "n/a" << '\n';
    }

    return (noavx_ok && avx_ok) ? 0 : 1;
}