SSE 向量乘矩阵
// SSE intrinsics are used when the target provides them; otherwise the
// function below falls back to equivalent scalar code.
#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
#include <xmmintrin.h>
#define SSE_VECTOR_MULTIPLY_MATRIX_USE_INTRINSICS 1
#endif

/// Four-component single-precision vector (treated as a row vector below).
struct Vector4
{
    float x, y, z, w;
};

/// 4x4 single-precision matrix, indexed as _M[row][column].
struct Matrix
{
    float _M[4][4];
};

/// Multiplies the row vector `v` by the matrix `m1` and stores the result in `ret`.
///
/// For every column c:
///   ret[c] = v.x * m1._M[0][c] + v.y * m1._M[1][c]
///          + v.z * m1._M[2][c] + v.w * m1._M[3][c]
///
/// @param v    Input row vector.
/// @param m1   Matrix to multiply by.
/// @param ret  Receives the transformed vector. It may refer to the same object
///             as `v`: all inputs are read before `ret` is written.
///
/// The previous implementation used 32-bit MSVC inline assembly, which does not
/// build on x64 or with other compilers. This version uses SSE intrinsics
/// instead. Rather than transposing the matrix and summing horizontally, each
/// vector component is broadcast and multiplied with a whole matrix row, so the
/// four results are produced in parallel.
void SSE_VectorMultiplyMatrix(const Vector4& v, const Matrix& m1, Vector4& ret)
{
    float out[4];

#if defined(SSE_VECTOR_MULTIPLY_MATRIX_USE_INTRINSICS)
    // Unaligned loads: neither Matrix nor its callers guarantee 16-byte alignment.
    const __m128 row0 = _mm_loadu_ps(m1._M[0]);
    const __m128 row1 = _mm_loadu_ps(m1._M[1]);
    const __m128 row2 = _mm_loadu_ps(m1._M[2]);
    const __m128 row3 = _mm_loadu_ps(m1._M[3]);

    // Terms are accumulated in the order w, x, y, z -- the same order the
    // original scalar summation used -- so rounding behaviour is unchanged.
    __m128 acc = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(v.w), row3),
                            _mm_mul_ps(_mm_set1_ps(v.x), row0));
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(v.y), row1));
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_set1_ps(v.z), row2));

    _mm_storeu_ps(out, acc);
#else
    // Scalar fallback with the same summation order as the SSE path.
    for (int col = 0; col < 4; ++col)
    {
        out[col] = v.w * m1._M[3][col] + v.x * m1._M[0][col]
                 + v.y * m1._M[1][col] + v.z * m1._M[2][col];
    }
#endif

    // Write the output only after every input has been consumed (alias safety).
    ret.x = out[0];
    ret.y = out[1];
    ret.z = out[2];
    ret.w = out[3];
}