SIMD 非连续load

https://acl.inf.ethz.ch/teaching/fastcode/2021/slides/07-simd-avx.pdf
下面两种写法在 -O3 下生成的汇编相同，在 -O2 下则不同。

#include <immintrin.h>
// Gather four doubles from unrelated addresses into a single 256-bit vector
// using SSE half-loads followed by an AVX 128-bit lane insert.
// Result lanes, low to high: *a, *b, *c, *d.
__m256d myArbitraryLoad2(double *a, double *b, double *c, double *d) {
    // Lower 128 bits: element 0 <- *a, then element 1 <- *b.
    __m128d lo = _mm_loadh_pd(_mm_load_sd(a), b);
    // Upper 128 bits, built the same way from *c and *d.
    __m128d hi = _mm_loadh_pd(_mm_load_sd(c), d);
    // The cast only reinterprets lo as the low half of a 256-bit value;
    // the insert below supplies the high half from hi.
    __m256d widened = _mm256_castpd128_pd256(lo);
    return _mm256_insertf128_pd(widened, hi, 1);
}

// Gather four doubles from unrelated addresses into a single 256-bit vector
// by first copying them into a small contiguous local array and then issuing
// one 256-bit load from that array.
// Result lanes, low to high: *a, *b, *c, *d.
__m256d myArbitraryLoad3(double *a, double *b, double *c, double *d) {
    // Stage the scattered values contiguously, in lane order.
    double g[4] = { *a, *b, *c, *d };
    // g is a plain double array with no 32-byte alignment request,
    // so the unaligned load variant is used.
    return _mm256_loadu_pd(g);
}
posted @ 2022-01-24 14:09  stdpain  阅读(46)  评论(0编辑  收藏  举报