Intrinsics实例之三维矩阵变换
1.矩阵变换:三维变换原理图如下
通常顶点有两种表示形式:
// vertex 表示有N个点
- SOA(Structure of Arrays,数组的结构体):每个分量各自存放在一个连续数组中
1 typedef struct SOA 2 { 3 float x[N], y[N], z[N], w[N]; 4 }SOA; 5 SOA vertex;
- AOS(Array of Structures,结构体的数组):每个顶点的四个分量存放在同一个结构体中
1 typedef struct AOS 2 { 3 float x, y, z, w; 4 }AOS; 5 AOS vertex[N];
采用 SOA 布局通常比 AOS 更容易获得 SIMD 加速(同一分量在内存中连续,一次加载即可取得 4 个顶点的同一分量),SOA 的实现代码如下:
1 int transform_intrinsics( float matrix[4][4], SOA *pVertex0, SOA *pVertex2, int n) 2 { 3 __m128 mx, my, mz, mw; 4 __m128 mx1, my1, mz1, mw1; 5 __m128 result; 6 // make loop can not access the length less than 4 7 int limit_remainder = N -3; 8 int i = 0; 9 for (; i < limit_remainder; i += 4) 10 { 11 // load vertex0[0],vertex[1],vertex[2],vertex[3] 12 mx = _mm_loadu_ps(pVertex0->x + i); 13 mx1 = matrix[0][0] * mx; 14 my = _mm_loadu_ps(pVertex0->y + i); 15 my1 = matrix[0][1] * my; 16 mz = _mm_loadu_ps(pVertex0->z + i); 17 mz1 = matrix[0][2] * mz; 18 mw = _mm_loadu_ps(pVertex0->w + i); 19 mw1 = matrix[0][3] * mw; 20 result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1)); 21 _mm_storeu_ps(pVertex2->x + i, result); 22 23 mx1 = matrix[1][0] * mx; 24 my1 = matrix[1][1] * my; 25 mz1 = matrix[1][2] * mz; 26 mw1 = matrix[1][3] * mw; 27 result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1)); 28 _mm_storeu_ps(pVertex2->y + i, result); 29 30 mx1 = matrix[2][0] * mx; 31 my1 = matrix[2][1] * my; 32 mz1 = matrix[2][2] * mz; 33 mw1 = matrix[2][3] * mw; 34 result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1)); 35 _mm_storeu_ps(pVertex2->z + i, result); 36 37 mx1 = matrix[3][0] * mx; 38 my1 = matrix[3][1] * my; 39 mz1 = matrix[3][2] * mz; 40 mw1 = matrix[3][3] * mw; 41 result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1)); 42 _mm_storeu_ps(pVertex2->w + i, result); 43 } 44 if (i < n) 45 { 46 do{ 47 pVertex2->x[i] += matrix[0][0] * pVertex0->x[i] + matrix[0][1] * pVertex0->y[i] + \ 48 matrix[0][2] * pVertex0->z[i] + matrix[0][3] * pVertex0->w[i]; 49 pVertex2->y[i] += matrix[1][0] * pVertex0->x[i] + matrix[1][1] * pVertex0->y[i] + \ 50 matrix[1][2] * pVertex0->z[i] + matrix[1][3] * pVertex0->w[i]; 51 pVertex2->z[i] += matrix[2][0] * pVertex0->x[i] + matrix[2][1] * pVertex0->y[i] + \ 52 matrix[2][2] * pVertex0->z[i] + matrix[2][3] * pVertex0->w[i]; 53 pVertex2->w[i] += matrix[3][0] * pVertex0->x[i] + matrix[3][1] * pVertex0->y[i] + \ 54 matrix[3][2] * 
pVertex0->z[i] + matrix[3][3] * pVertex0->w[i]; 55 }while( ++i < n); 56 } 57 }
AOS 的实现代码如下(这里没有进行循环展开):
/*
 * Transforms `length` 4-component vectors stored in array-of-structures
 * layout by a 4x4 matrix:
 *
 *     p_outputs[i][r] = sum over k of p_matrix[r][k] * p_vectors[i][k]
 *
 * p_matrix  - 4x4 transform; row r produces output component r.
 * p_vectors - input vectors, four floats each.
 * p_outputs - output vectors, four floats each; may be the same array as
 *             p_vectors because each vector is fully read before its result
 *             is stored.
 * length    - number of vectors to transform; nothing is written when it
 *             is zero or negative.
 *
 * Always returns 0.
 */
int transform_intrinsics(float p_matrix[4][4],
                         float p_vectors[][4],
                         float p_outputs[][4], int length)
{
    int i;
    __m128 col0 = _mm_loadu_ps(p_matrix[0]);
    __m128 col1 = _mm_loadu_ps(p_matrix[1]);
    __m128 col2 = _mm_loadu_ps(p_matrix[2]);
    __m128 col3 = _mm_loadu_ps(p_matrix[3]);

    /* After the transpose each register holds one matrix column, so the
     * product becomes a sum of columns scaled by the vector's components. */
    _MM_TRANSPOSE4_PS(col0, col1, col2, col3);

    for (i = 0; i < length; i++) {
        const float *v = p_vectors[i];

        /* Explicit broadcast + multiply instead of `float * __m128`, which
         * only compiles with the GCC/Clang vector-extension operators. */
        const __m128 xy = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(v[0]), col0),
                                     _mm_mul_ps(_mm_set1_ps(v[1]), col1));
        const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(v[2]), col2),
                                     _mm_mul_ps(_mm_set1_ps(v[3]), col3));

        _mm_storeu_ps(p_outputs[i], _mm_add_ps(xy, zw));
    }

    return 0;
}