Intrinsics实例之三维矩阵变换

1.矩阵变换:三维变换原理图如下


 




 

 

 

 

 

 

 

 

通常顶点有两种表示形式:

// 下文中 vertex 表示 N 个顶点的集合(N 为编译期常量,用作数组长度)

- 数组的结构体 SOA(Structure of Arrays):x、y、z、w 各分量分别存放在一个数组中

1 typedef struct SOA
2 {
3 float x[N], y[N], z[N], w[N];
4 }SOA;
5 SOA vertex;


 


 


 


 

- 结构体数组 AOS(Array of Structures):每个顶点是一个包含 x、y、z、w 的结构体,所有顶点组成一个数组

1 typedef struct AOS
2 {
3 float x, y, z, w;
4 }AOS;
5 AOS vertex[N];


 


 


 


 

采用 SOA 布局时,同一分量在内存中连续存放,可以一次加载 4 个顶点的同一分量,因此通常比 AOS 获得更好的加速效果。SOA 的实现代码如下:

 

 1 int transform_intrinsics( float matrix[4][4], SOA *pVertex0, SOA *pVertex2, int n)
 2 {
 3     __m128 mx, my, mz, mw;
 4     __m128 mx1, my1, mz1, mw1;
 5     __m128 result;
 6     // make loop can not access the length less than 4
 7     int limit_remainder = N -3;
 8     int i = 0;
 9     for (; i < limit_remainder; i += 4)
10     {
11         // load vertex0[0],vertex[1],vertex[2],vertex[3]
12         mx = _mm_loadu_ps(pVertex0->x + i);
13         mx1 = matrix[0][0] * mx; 
14         my = _mm_loadu_ps(pVertex0->y + i);
15         my1 = matrix[0][1] * my;
16         mz = _mm_loadu_ps(pVertex0->z + i);
17         mz1 = matrix[0][2] * mz;
18         mw = _mm_loadu_ps(pVertex0->w + i);
19         mw1 = matrix[0][3] * mw;
20         result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1));
21         _mm_storeu_ps(pVertex2->x + i, result);
22 
23         mx1 = matrix[1][0] * mx;
24         my1 = matrix[1][1] * my;
25         mz1 = matrix[1][2] * mz;
26         mw1 = matrix[1][3] * mw; 
27         result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1));
28         _mm_storeu_ps(pVertex2->y + i, result);
29 
30         mx1 = matrix[2][0] * mx;
31         my1 = matrix[2][1] * my;
32         mz1 = matrix[2][2] * mz;
33         mw1 = matrix[2][3] * mw; 
34         result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1));
35         _mm_storeu_ps(pVertex2->z + i, result);
36 
37         mx1 = matrix[3][0] * mx;
38         my1 = matrix[3][1] * my;
39         mz1 = matrix[3][2] * mz;
40         mw1 = matrix[3][3] * mw; 
41         result = _mm_add_ps(_mm_add_ps(mx1,my1),_mm_add_ps(mz1,mw1));
42         _mm_storeu_ps(pVertex2->w + i, result);
43     }
44     if (i < n)
45     {
46         do{
47             pVertex2->x[i] += matrix[0][0] * pVertex0->x[i] +  matrix[0][1] * pVertex0->y[i] + \
48                           matrix[0][2] * pVertex0->z[i] +  matrix[0][3] * pVertex0->w[i];
49             pVertex2->y[i] += matrix[1][0] * pVertex0->x[i] +  matrix[1][1] * pVertex0->y[i] + \
50                           matrix[1][2] * pVertex0->z[i] +  matrix[1][3] * pVertex0->w[i];
51             pVertex2->z[i] += matrix[2][0] * pVertex0->x[i] +  matrix[2][1] * pVertex0->y[i] + \
52                           matrix[2][2] * pVertex0->z[i] +  matrix[2][3] * pVertex0->w[i];
53             pVertex2->w[i] += matrix[3][0] * pVertex0->x[i] +  matrix[3][1] * pVertex0->y[i] + \
54                           matrix[3][2] * pVertex0->z[i] +  matrix[3][3] * pVertex0->w[i];
55         }while( ++i < n);
56     }
57 }

AOS 的实现代码如下(这里没有进行循环展开):

 /*
  * Transform `length` vertices stored in AOS layout by a 4x4 matrix.
  *
  * For every vertex i in [0, length) this computes
  *     p_outputs[i][r] = sum over c of p_matrix[r][c] * p_vectors[i][c]
  * for r = 0..3. One vertex is processed per iteration (no unrolling).
  *
  * p_matrix  : 4x4 transform, indexed p_matrix[row][column].
  * p_vectors : input vertices, four floats (x, y, z, w) each.
  * p_outputs : output vertices, four floats each.
  * length    : number of vertices; nothing is written when length <= 0.
  *
  * Returns 0.
  */
 int transform_intrinsics( float p_matrix[ 4 ][ 4 ],
                           float p_vectors[][ 4 ],
                           float p_outputs[][ 4 ], int length )
 {
     int i;
     __m128 col0 = _mm_loadu_ps(p_matrix[0]);
     __m128 col1 = _mm_loadu_ps(p_matrix[1]);
     __m128 col2 = _mm_loadu_ps(p_matrix[2]);
     __m128 col3 = _mm_loadu_ps(p_matrix[3]);

     /* After the transpose, colK holds column K of the matrix, so the
      * product becomes a sum of columns scaled by the vertex components. */
     _MM_TRANSPOSE4_PS(col0, col1, col2, col3);

     for (i = 0; i < length; i++) {
         const float *v = p_vectors[i];
         /* Broadcast each component explicitly instead of relying on the
          * compiler-specific `float * __m128` vector extension. */
         const __m128 xy = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(v[0]), col0),
                                      _mm_mul_ps(_mm_set1_ps(v[1]), col1));
         const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(v[2]), col2),
                                      _mm_mul_ps(_mm_set1_ps(v[3]), col3));
         _mm_storeu_ps(p_outputs[i], _mm_add_ps(xy, zw));
     }
     return 0;
 }

 

 

 

 

 

 

posted @ 2014-12-03 15:02  嫣儿  阅读(420)  评论(0)  编辑  收藏  举报