09 浮点和媒体指令
1. x87浮点指令:
相当于栈结构,st(0)是栈顶。
__asm {
fld flt1 //float load: push flt1 onto the x87 register stack, so st(0) = flt1
fld flt2 //push flt2: now st(0) = flt2 and st(1) = flt1
fadd st(0), st(1) //float add: st(0) = st(0) + st(1), nothing is popped
faddp st(1), st(0) //float add and pop: st(1) = st(1) + st(0), then st(0) is popped, so the sum becomes the new st(0)
fst fltSum //float store: copy st(0) to fltSum, st(0) stays on the stack
fstp fltSum //float store and pop: store st(0) to fltSum and pop it
//integers can be added the same way
fild nNum1 //load integer: convert nNum1 to floating point and push it
fild nNum2 //push nNum2 as well: st(0) = nNum2, st(1) = nNum1
faddp st(1), st(0) //st(1) = st(1) + st(0), then pop: the sum is left in st(0)
fistp nSum //store integer and pop: convert st(0) back to an integer, store it in nSum, then pop
}
2. MMX Multi Media eXtension 多媒体扩展指令
指令本意是允许多个操作并行执行,称为单指令多数据,即SIMD,Single Instruction Multiple Data,这种模式可以复制多个操作数,将它们打包到大型寄存器并行执行统一操作。
64bit的MM0 ~ MM7:实际上复用了x87浮点寄存器(ST0 ~ ST7对应的物理寄存器R0 ~ R7)的低64位,因此MMX代码执行完后要用emms清空状态,才能继续执行x87浮点指令。
int nNum1 = 1;
int nNum2 = 2;
int nSum = 0;
// unsigned char: the value 255 does not fit in a signed char
unsigned char ary1[] = { 255, 1, 3, 4, 5, 6, 0, 8 };
unsigned char ary2[] = { 255, 1, 3, 4, 5, 6, 1, 8 };
__asm {
movq mm0, ary1 //move quadword: load all 8 bytes of ary1
paddb mm0, ary2 //add packed byte integers: 8 independent byte adds, a carry never crosses into the neighbouring byte (255 + 255 wraps to 254)
movq ary2, mm0 //store the 8 byte sums back into ary2
movd mm2, nNum1 //move doubleword: low 32 bits = nNum1, high 32 bits are zeroed
movd mm1, nNum1 //nNum1 is a 32-bit int, so load it with movd; movq would read 4 bytes past the variable
paddd mm1, mm2 //add packed doubleword integers: two independent 32-bit adds
paddq mm1, mm2 //add packed quadword integer: one 64-bit add
movq mm0, mm1 //register-to-register copy
emms //empty MMX state: MM0-MM7 share storage with the x87 registers, so reset the x87 tags before any later floating-point code
}
可以不写内联汇编,而是包含#include <mmintrin.h>使用MMX指令集的内建函数(intrinsics)。
__m64 m64Num1;
m64Num1.m64_u8[0] = 255;
m64Num1.m64_u8[1] = 1;
m64Num1.m64_u8[2] = 2;
m64Num1.m64_u8[3] = 3;
m64Num1.m64_u8[4] = 4;
m64Num1.m64_u8[5] = 5;
m64Num1.m64_u8[6] = 6;
m64Num1.m64_u8[7] = 7;
__m64 m64Num2 = { 0 };
m64Num2 = _m_paddb(m64Num1, m64Num1);
printf("%lld", m64Num2);
Intel的MMX指令集只支持整数运算,不支持浮点运算;AMD在其基础上扩展了浮点SIMD指令,号称3DNow!(3D No waiting!)。
3. SSE Streaming SIMD Extensions 流式SIMD扩展
提供了128bit的XMM0 ~ XMM7
float flt1 = 1.1f;
float flt2 = 1.2f;
float fltSum = 0.0;
fltSum = flt1 + flt2;
float ary1[] = { 1.1f, 1.2f, 2.3f, 3.4f, };
float ary2[] = { 1.1f, 1.1f, 2.5f, 3.4f, };
__asm {
movups xmm0, ary2 //move unaligned packed signle
addps xmm0, ary1 //addps如果内存未对齐到16字节,也会崩溃
movaps ary1, xmm0 //move aligned packed single, 要对齐到16字节
movss xmm1, flt1 //move scalar single
addss xmm1, flt2 //add scalar single
addsd xmm1, flt2 //add scalar double
movss fltSum, xmm1
cvttss2si eax, xmm0 //convert scalar single to int
}
可以不写内联汇编,而是包含头文件:
#include <xmmintrin.h>//SSE
#include <emmintrin.h>//SSE2
#include <pmmintrin.h>//SSE3
#include <smmintrin.h>//SSE4
float ary1[] = { 1.1, 1.2, 2.3, 3.4, };// 4.5, 5.6, 0.7, 8.7};
float ary2[] = { 1.1, 1.1, 2.5, 3.4, };//4.5, 5.2, 0.7, 8.4 };
// Build the vector in one step; _mm_setr_ps takes the lanes in ascending order,
// so lane 0 = 0.0f, lane 1 = 1.1f, lane 2 = 2.2f, lane 3 = 3.3f.
__m128 m128Num1 = _mm_setr_ps(0.0f, 1.1f, 2.2f, 3.3f);
// Lane-wise single-precision add of the vector to itself.
__m128 m128Num2 = _mm_add_ps(m128Num1, m128Num1);
4. AVX Advanced Vector Extensions 高级向量扩展
提供了256bit的YMM0 ~ YMM7,低128bit是原来的XMM0 ~ XMM7
//基本上在SSE指令前加上v即可
__asm {
vmovups xmm0, ary2 //load 4 packed singles; no alignment requirement
vaddps xmm0, xmm0, ary1 //three operands: dest = src1 + src2, here xmm0 = xmm0 + ary1; src1 must be a register that was actually loaded
vmovups ary1, xmm0 //store the 4 sums back into ary1
vmovss xmm1, flt1 //load scalar single
vaddss xmm1, xmm1, flt2 //low lane of xmm1 = flt1 + flt2
vmovss fltSum, xmm1 //store the single-precision sum
vcvtss2sd xmm2, xmm2, flt1 //vaddsd works on 64-bit doubles, so widen the floats first instead of reading a float as a double
vcvtss2sd xmm3, xmm3, flt2
vaddsd xmm2, xmm2, xmm3 //add scalar double: low 64 bits of xmm2 = (double)flt1 + (double)flt2
vmovupd ymm2, m256Num1 //load 4 packed doubles so that ymm2 holds a defined value
vaddpd ymm1, ymm2, m256Num1 //ymm1 = ymm2 + m256Num1, four double-precision adds
vzeroupper //clear the upper halves of the YMM registers before returning to non-VEX (legacy SSE) code
}
可以不写内联汇编,而是包含#include <immintrin.h>使用AVX指令集。
// Build the vector in one step; _mm256_setr_pd takes the lanes in ascending order,
// so lane 0 = 0.0, lane 1 = 1.1, lane 2 = 2.2, lane 3 = 3.3.
__m256d m256Num1 = _mm256_setr_pd(0.0, 1.1, 2.2, 3.3);
// Lane-wise double-precision add of the vector to itself.
__m256d m256Num2 = _mm256_add_pd(m256Num1, m256Num1);