loop

    4.4
s* 17.5

--mul
r 13.1
e 18.62
x 4.4 //bmi2
s* 17.5
--div
r 379 //125
e 114 //94
ss 48.35
sd 57

--vpslld
4.4




// _k: scratch routine; this file documents no contract for it.
// Reads:  %edi, %xmm0, %ecx
// Writes: %rax, flags, and the 4 bytes of memory addressed by %eax
.global _k
_k:
    movl    %edi, %eax
    // For a 32-bit operand the shift count is masked to its low 5 bits,
    // so a count of 32 behaves as 0 and %eax keeps its value.
    shll    $32, %eax
    // Copy the low 64 bits of %xmm0 into %rax. The plain mov mnemonic has
    // no XMM-to-GPR form and is rejected by the assembler; movq is the
    // mnemonic for this transfer. This overwrites the value built above.
    movq    %xmm0, %rax
    // NOTE(review): the address uses the 32-bit register %eax, so only the
    // low half of the value taken from %xmm0 forms the address. Confirm
    // that this is intended rather than (%rax).
    addl    %ecx, (%eax)
    ret

// _t: runs one MULX per iteration, counting down with the LOOP instruction.
// Takes no inputs. Writes %rdx, %rcx, %rdi and %rsi; %rcx is 0 on return.
#define T_ITERATIONS 0xf1234567
.globl _t
_t:
    mov     $2, %rdx                // implicit MULX source operand
    mov     $T_ITERATIONS, %rcx     // LOOP counter, also the explicit MULX source
1:
    mulx    %rcx, %rdi, %rsi        // %rsi:%rdi = %rdx * %rcx (high:low)
    loop    1b                      // decrement %rcx, branch while it is non-zero
    ret

// _f0: empty routine that returns immediately without touching any register.
// NOTE(review): presumably a call-overhead baseline for the timing routines
// in this file; confirm against the caller.
.globl _f0
_f0:
    ret

// A(): one scalar single-precision add of %xmm0 to itself. Each use reads the
// result of the previous use, so consecutive uses form a dependency chain.
#define A() addss %xmm0, %xmm0
#define KG_ITERATIONS 0xf1234567

// Disabled experiment kept for reference:
//mulx %rcx, %rsi, %rdi

// _kg: repeats the loop body KG_ITERATIONS times using sub/jnz.
// Reads:  %xmm0 (whatever value it holds on entry)
// Writes: %rcx (0 on return), %rdx (0), %xmm0, flags
.globl _kg
_kg:
    mov     $KG_ITERATIONS, %rcx    // iteration counter
    mov     $0, %rdx                // %rdx = 0 (high half of the dividend for the disabled div)
1:
    // Other loop bodies that were tried here, left disabled:
    //vpaddd %ymm0, %ymm0, %ymm0
    //vpslld $20, %xmm0, %xmm0
    //div %rcx
    //shr $1, %rax
    //divsd %xmm0, %xmm0
    //add %rax, %rax
    //add %r8, %r8
    //add %r9, %r9
    //add %rsi, %rsi
    //add %rdi, %rsi //4 loop is better

    // Active body: four chained A() expansions.
    .rept 4
    A()
    .endr
    //A()
    sub     $1, %rcx
    jnz     1b                      // original note: loop too slow
    ret
posted @ 2022-06-07 10:40  zJanly  Views (165)  Comments (0)  Edit  Bookmark  Report