class __declspec(align(16)) vector4 {
public:
union {
struct { float x, y, z, w; };
__m128 mm;
};
};
inline void dot3( vector4 & r, const vector4 & x, const vector4 & y )
{
/*register vector4 s;
s.mm = _mm_mul_ps( x.mm , y.mm );
r.mm = _mm_add_ss( s.mm , _mm_movehl_ps( s.mm , s.mm ) );
r.mm = _mm_add_ss( r.mm , _mm_shuffle_ps( r.mm , r.mm , 1 ) );*/
__asm {
mov ecx, y
mov edx, x
mov eax, r
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [edx]
mulps xmm1, xmm0 // xmm1 = s0, s1, s2, s3
movhlps xmm0, xmm1 // xmm0 = s2, s3, *, *
addss xmm0, xmm1 // xmm0 = s0+s2, s3, *, *
shufps xmm1, xmm1, 1 // xmm1 = s1, s0, s0, s0
addss xmm0, xmm1 // xmm0 = s0+s2+s1, s3, *, *
movaps xmmword ptr [eax], xmm0
}
}
inline void dot4( vector4 *r, const vector4 *x, const vector4 *y )
{
/*register vector4 s;
s.mm = _mm_mul_ps( x->mm , y->mm );
r->mm = _mm_add_ss( s.mm , _mm_movehl_ps( s.mm , s.mm ) );
r->mm = _mm_add_ss( r->mm , _mm_shuffle_ps( r->mm , r->mm , 1 ) );
r->mm = _mm_add_ss( r->mm , _mm_shuffle_ps( r->mm , r->mm , 3 ) );*/
__asm {
mov ecx, y
mov edx, x
mov eax, r
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [edx]
mulps xmm1, xmm0 // xmm1 = s0, s1, s2, s3
movhlps xmm0, xmm1 // xmm0 = s2, s3, *, *
addss xmm0, xmm1 // xmm0 = s0+s2, s3, *, *
shufps xmm1, xmm1, 253 // xmm1 = s1, s3, s3, s3
addss xmm0, xmm1 // xmm0 = s0+s2+s1, s3, *, *
movhlps xmm1, xmm1 // xmm1 = s3, s3, s3, s3
addss xmm0, xmm1 // xmm0 = s0+s2+s1+s3, s3, *, *
movaps xmmword ptr [eax], xmm0
}
}
public:
union {
struct { float x, y, z, w; };
__m128 mm;
};
};
inline void dot3( vector4 & r, const vector4 & x, const vector4 & y )
{
/*register vector4 s;
s.mm = _mm_mul_ps( x.mm , y.mm );
r.mm = _mm_add_ss( s.mm , _mm_movehl_ps( s.mm , s.mm ) );
r.mm = _mm_add_ss( r.mm , _mm_shuffle_ps( r.mm , r.mm , 1 ) );*/
__asm {
mov ecx, y
mov edx, x
mov eax, r
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [edx]
mulps xmm1, xmm0 // xmm1 = s0, s1, s2, s3
movhlps xmm0, xmm1 // xmm0 = s2, s3, *, *
addss xmm0, xmm1 // xmm0 = s0+s2, s3, *, *
shufps xmm1, xmm1, 1 // xmm1 = s1, s0, s0, s0
addss xmm0, xmm1 // xmm0 = s0+s2+s1, s3, *, *
movaps xmmword ptr [eax], xmm0
}
}
inline void dot4( vector4 *r, const vector4 *x, const vector4 *y )
{
/*register vector4 s;
s.mm = _mm_mul_ps( x->mm , y->mm );
r->mm = _mm_add_ss( s.mm , _mm_movehl_ps( s.mm , s.mm ) );
r->mm = _mm_add_ss( r->mm , _mm_shuffle_ps( r->mm , r->mm , 1 ) );
r->mm = _mm_add_ss( r->mm , _mm_shuffle_ps( r->mm , r->mm , 3 ) );*/
__asm {
mov ecx, y
mov edx, x
mov eax, r
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [edx]
mulps xmm1, xmm0 // xmm1 = s0, s1, s2, s3
movhlps xmm0, xmm1 // xmm0 = s2, s3, *, *
addss xmm0, xmm1 // xmm0 = s0+s2, s3, *, *
shufps xmm1, xmm1, 253 // xmm1 = s1, s3, s3, s3
addss xmm0, xmm1 // xmm0 = s0+s2+s1, s3, *, *
movhlps xmm1, xmm1 // xmm1 = s3, s3, s3, s3
addss xmm0, xmm1 // xmm0 = s0+s2+s1+s3, s3, *, *
movaps xmmword ptr [eax], xmm0
}
}