sse矩阵乘法 应该是1毫秒纯运算1000次 asm 是 啊瑟门波利 内联汇编的意思
#include <intrin.h> #include <math.h> //注意那个四个分量的变量 洗牌,给那个分量对准后 叉乘 (cross),那个是sse的叉乘 struct Vector4 { float x, y, z, w; }; struct Matrix { float _M[4][4]; public: //单位化 void Identity() { ZeroMemory((void*)_M,sizeof(float)*16); _M[0][0] = 1.0f; _M[1][1] = 1.0f; _M[2][2] = 1.0f; _M[3][3] = 1.0f; } //转置矩阵 Matrix Transpose() { Matrix ret; ret._M[0][0] = _M[0][0]; ret._M[1][0] = _M[0][1]; ret._M[2][0] = _M[0][2]; ret._M[3][0] = _M[0][3]; ret._M[0][1] = _M[1][0]; ret._M[1][1] = _M[1][1]; ret._M[2][1] = _M[1][2]; ret._M[3][1] = _M[1][3]; ret._M[0][2] = _M[2][0]; ret._M[1][2] = _M[2][1]; ret._M[2][2] = _M[2][2]; ret._M[3][2] = _M[2][3]; ret._M[0][3] = _M[3][0]; ret._M[1][3] = _M[3][1]; ret._M[2][3] = _M[3][2]; ret._M[3][3] = _M[3][3]; return ret; } }; float Sum(const Vector4 & arg1) { return arg1.x + arg1.y + arg1.z + arg1.w; } Vector4 Normaliz(Vector4 v1) { Vector4 v2; float k = 1/sqrtf(v1.x * v1.x + v1.y * v1.y + v1.z * v1.z); v2.x = v1.x * k; v2.y = v1.y * k; v2.z = v1.z * k; return v2; } //向量相加 Vector4 SSE_VectiorAdd ( const Vector4 &Op_A, const Vector4 &Op_B ) { Vector4 Ret_Vector; __asm { MOV EAX, Op_A // Load pointers into CPU regs MOV EBX, Op_B MOVUPS XMM0, [EAX] // Move unaligned vectors to SSE regs MOVUPS XMM1, [EBX] ADDPS XMM0, XMM1 // v1 + v2 MOVUPS [Ret_Vector], XMM0 // Save the return vector } return Ret_Vector; } //向量叉乘 Vector4 SSE_VectorCross(const Vector4 &Op_A, const Vector4 &Op_B) { Vector4 Ret_Vector; __asm { MOV EAX, Op_A // Load pointers into CPU regs MOV EBX, Op_B MOVUPS XMM0, [EAX] // Move unaligned vectors to SSE regs MOVUPS XMM1, [EBX] MOVAPS XMM2, XMM0 // Make a copy of vector A MOVAPS XMM3, XMM1 // Make a copy of vector B SHUFPS XMM0, XMM0, 0xD8 // 11 01 10 00 Flip the middle elements of A SHUFPS XMM1, XMM1, 0xE1 // 11 10 00 01 Flip first two elements of B MULPS XMM0, XMM1 // Multiply the modified register vectors SHUFPS XMM2, XMM2, 0xE1 // 11 10 00 01 Flip first two elements of the A copy SHUFPS XMM3, XMM3, 0xD8 // 11 01 10 00 Flip the middle elements of the B copy MULPS XMM2, XMM3 // Multiply the modified register vectors SUBPS XMM0, XMM2 // Subtract the two resulting register vectors MOVUPS [Ret_Vector], XMM0 // Save the return vector } return Ret_Vector; } //向量缩放 Vector4 SSE_VectorScale(const Vector4 &Op_A, const float &Op_B) { Vector4 Ret_Vector; __m128 F = _mm_set1_ps(Op_B); // Create a 128 bit vector with four elements Op_B __asm { MOV EAX, Op_A // Load pointer into CPU reg MOVUPS XMM0, [EAX] // Move the vector to an SSE reg MULPS XMM0, F // Multiply vectors MOVUPS [Ret_Vector], XMM0 // Save the return vector } return Ret_Vector; } void SSE_VectorDot(const Vector4 &Op_A, const Vector4 &Op_B,float& ret) { Vector4 v1; //__m128 F = _mm_set1_ps(Op_B); // Create a 128 bit vector with four elements Op_B __asm { MOV EAX, Op_A // Load pointer into CPU reg MOV EBX, Op_B MOVUPS XMM1, [EBX] MOVUPS XMM0, [EAX] // Move the vector to an SSE reg MULPS XMM0,XMM1 // Multiply vectors MOVUPS [v1], XMM0 // Save the return vector } ret = v1.x + v1.y + v1.z; } void SSE_VectorMultiplyMatrix(const Vector4& v,const Matrix& m1,Vector4& ret) { Vector4 va,vb,vc,vd; Vector4 *pva,*pvb,*pvc,*pvd; const Vector4 *pv; //取出矩阵每一列 va.x = m1._M[0][0]; va.y = m1._M[1][0]; va.z = m1._M[2][0]; va.w = m1._M[3][0]; vb.x = m1._M[0][1]; vb.y = m1._M[1][1]; vb.z = m1._M[2][1]; vb.w = m1._M[3][1]; vc.x = m1._M[0][2]; vc.y = m1._M[1][2]; vc.z = m1._M[2][2]; vc.w = m1._M[3][2]; vd.x = m1._M[0][3]; vd.y = m1._M[1][3]; vd.z = m1._M[2][3]; vd.w = m1._M[3][3]; pva = &va; pvb = &vb; pvc = &vc; pvd = &vd; pv = &v; __asm { //矩阵四列放入mmx0-mmx3 MOV EAX, pva // Load pointer into CPU reg MOVUPS XMM0, [EAX] MOV EAX, pvb // Load pointer into CPU reg MOVUPS XMM1, [EAX] MOV EAX, pvc // Load pointer into CPU reg MOVUPS XMM2, [EAX] MOV EAX, pvd // Load pointer into CPU reg MOVUPS XMM3, [EAX] //向量放入 mmx4 MOV EAX, pv MOVUPS XMM4, [EAX] //向量点乘矩阵每列 MULPS XMM0,XMM4 MULPS XMM1,XMM4 MULPS XMM2,XMM4 MULPS XMM3,XMM4 //输出四个分量 MOVUPS [va], XMM0 MOVUPS [vb], XMM1 MOVUPS [vc], XMM2 MOVUPS [vd], XMM3 } //四个分量求和得变换后向量 ret.x = va.w + va.x + va.y + va.z; ret.y = vb.w + vb.x + vb.y + vb.z; ret.z = vc.w + vc.x + vc.y + vc.z; ret.w = vd.w + vd.x + vd.y + vd.z; } void SSE_MatrixMultiplyMatrix(const Matrix& arg1,const Matrix& arg2,Matrix & ret) { Matrix m1,m2; Vector4 v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16; Vector4 *pv1,*pv2,*pv3,*pv4,*pv5,*pv6,*pv7,*pv8; m1 = arg1; m2 = arg2; m2 = m2.Transpose(); pv1 = (Vector4*)&m1._M[0][0]; pv2 = (Vector4*)&m1._M[1][0]; pv3 = (Vector4*)&m1._M[2][0]; pv4 = (Vector4*)&m1._M[3][0]; pv5 = (Vector4*)&m2._M[0][0]; pv6 = (Vector4*)&m2._M[1][0]; pv7 = (Vector4*)&m2._M[2][0]; pv8 = (Vector4*)&m2._M[3][0]; __asm { MOV EAX, pv5 MOV EBX, pv6 MOV ECX, pv7 MOV EDX, pv8 MOVUPS XMM1, [EAX] MOVUPS XMM2, [EBX] MOVUPS XMM3, [ECX] MOVUPS XMM4, [EDX] MOV EAX, pv1 MOVUPS XMM0, [EAX] MULPS XMM1, XMM0 MOVUPS[v1], XMM1 MULPS XMM2, XMM0 MOVUPS[v2], XMM2 MULPS XMM3, XMM0 MOVUPS[v3], XMM3 MULPS XMM4, XMM0 MOVUPS[v4], XMM4 MOV EAX, pv5 MOV EBX, pv6 MOV ECX, pv7 MOV EDX, pv8 MOVUPS XMM1, [EAX] MOVUPS XMM2, [EBX] MOVUPS XMM3, [ECX] MOVUPS XMM4, [EDX] MOV EAX, pv2 MOVUPS XMM0, [EAX] MULPS XMM1, XMM0 MOVUPS[v5], XMM1 MULPS XMM2, XMM0 MOVUPS[v6], XMM2 MULPS XMM3, XMM0 MOVUPS[v7], XMM3 MULPS XMM4, XMM0 MOVUPS[v8], XMM4 MOV EAX, pv5 MOV EBX, pv6 MOV ECX, pv7 MOV EDX, pv8 MOVUPS XMM1, [EAX] MOVUPS XMM2, [EBX] MOVUPS XMM3, [ECX] MOVUPS XMM4, [EDX] MOV EAX, pv3 MOVUPS XMM0, [EAX] MULPS XMM1, XMM0 MOVUPS[v9], XMM1 MULPS XMM2, XMM0 MOVUPS[v10], XMM2 MULPS XMM3, XMM0 MOVUPS[v11], XMM3 MULPS XMM4, XMM0 MOVUPS[v12], XMM4 MOV EAX, pv5 MOV EBX, pv6 MOV ECX, pv7 MOV EDX, pv8 MOVUPS XMM1, [EAX] MOVUPS XMM2, [EBX] MOVUPS XMM3, [ECX] MOVUPS XMM4, [EDX] MOV EAX, pv4 MOVUPS XMM0, [EAX] MULPS XMM1, XMM0 MOVUPS[v13], XMM1 MULPS XMM2, XMM0 MOVUPS[v14], XMM2 MULPS XMM3, XMM0 MOVUPS[v15], XMM3 MULPS XMM4, XMM0 MOVUPS[v16], XMM4 } ret._M[0][0] = Sum(v1); ret._M[0][1] = Sum(v2); ret._M[0][2] = Sum(v3); ret._M[0][3] = Sum(v4); ret._M[1][0] = Sum(v5); ret._M[1][1] = Sum(v6); ret._M[1][2] = Sum(v7); ret._M[1][3] = Sum(v8); ret._M[2][0] = Sum(v9); ret._M[2][1] = Sum(v10); ret._M[2][2] = Sum(v11); ret._M[2][3] = Sum(v12); ret._M[3][0] = Sum(v13); ret._M[3][1] = Sum(v14); ret._M[3][2] = Sum(v15); ret._M[3][3] = Sum(v16); return; }
#include <Windows.h> #include <iostream> #include <math.h> #include "sse.h" using namespace std; void main(int argc, char **argv) { Matrix m1,m2,mRet; m1.Identity(); m2.Identity(); float time; time = GetTickCount(); for (int t = 0; t < 100000000; t++) { SSE_MatrixMultiplyMatrix(m1,m2,mRet); } time = GetTickCount() - time; _asm { int 3 } return; }