///////////////////////////////////////////////////////////////////////// // // Performance benchmarking program for various normalize functions // // by Elvic Liang // ///////////////////////////////////////////////// #include <math.h> #include <xmmintrin.h> #include <time.h> struct Vector { float x, y, z; inline Vector() {} inline Vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} inline Vector operator * (float rhs) const { Vector temp; temp.x = x * rhs; temp.y = y * rhs; temp.z = z * rhs; return temp; } }; template <typename T> inline T max(T a, T b) { return ((a > b) ? a : b); } inline float rcpf(float x) { #ifdef _MSC_VER return 1.0f / x; #else const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rcp_ss(a); // one more iteration return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a))); #endif } inline float invsqrtf(float x) { const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f)); const __m128 r = _mm_rsqrt_ss(a); // one more iteration return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r))))); } inline float fastinvsqrt(float x) { float xhalf = 0.5f * x; int i = *(int *)&x; i = 0x5f3759df - (i >> 1); x = *(float *)&i; x = x * (1.5f - xhalf * x * x); return x; } inline float fastsqrt(float x) { union { int intPart; float floatPart; } convertor; union { int intPart; float floatPart; } convertor2; convertor.floatPart = x; convertor2.floatPart = x; convertor.intPart = 0x1fbcf800 + (convertor.intPart >> 1); convertor2.intPart = 0x5f3759df - (convertor2.intPart >> 1); return 0.5f * (convertor.floatPart + (x * convertor2.floatPart)); } inline float dot(const Vector & a, const Vector & b) { return (a.x * b.x + a.y * b.y + a.z * b.z); } inline float len(const Vector & a) { const float l = dot(a, a); return sqrtf(max(0.0f, l)); } inline Vector normalize_ref(const Vector & a) { float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z)); // Using division gives higher precision than multiplying (1/length) return Vector(a.x / length, a.y / length, a.z / length); } inline Vector normalize(const Vector & a) { return a * invsqrtf(dot(a, a)); } inline Vector normalize_v1(const Vector & a) { const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z), _mm_set_ss(1.0e-30f)); const __m128 r = _mm_rsqrt_ss(pa); // one more iteration const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r))))); return a * d; } inline Vector normalize_v2(const Vector & a) { return a * fastinvsqrt(dot(a, a)); } inline Vector normalize_v3(const Vector & a) { // TODO: Use SSE 4.2 dot product intrinsic when available const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x); const __m128 s = _mm_mul_ps(x, x); const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s)); const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f)); const __m128 r = _mm_rsqrt_ss(pa); // one more iteration return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), _mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r))))); } inline float normalize_len(Vector & r, const Vector & a) { const float l = len(a); const float d = max(l, 1.0e-30f); r = a * rcpf(d); return d; } inline float normalize_len_v1(Vector & r, const Vector & a) { const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z)); r = a * rcpf(d); return d; } inline float normalize_len_v2(Vector & r, const Vector & a) { const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z)); const __m128 pa = _mm_set_ss(d); const __m128 pr = _mm_rcp_ss(pa); // one more iteration const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa))); r = a * rd; return d; } inline float normalize_len_v3(Vector & r, const Vector & a) { const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f), _mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z))); const __m128 pr = _mm_rcp_ss(pa); // one more iteration const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa))); r = a * rd; return _mm_cvtss_f32(pa); } inline float normalize_len_v4(Vector & r, const Vector & a) { const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z)); r = a * rcpf(d); return d; } inline float normalize_len_v5(Vector & r, const Vector & a) { // TODO: Use SSE 4.2 dot product intrinsic when available const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x); const __m128 s = _mm_mul_ps(x, x); const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s)); const __m128 pa = _mm_sqrt_ss( _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f))); const __m128 pr = _mm_rcp_ss(pa); // one more iteration r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa))); return _mm_cvtss_f32(pa); } struct Random { unsigned int state; inline Random(unsigned int seed = 0x9e3779b1) { state = hash(seed); } inline unsigned int hash(unsigned int a) { a = (a+0x7ed55d16) + (a<<12); a = (a^0xc761c23c) ^ (a>>19); a = (a+0x165667b1) + (a<<5); a = (a+0xd3a2646c) ^ (a<<9); a = (a+0xfd7046c5) + (a<<3); a = (a^0xb55a4f09) ^ (a>>16); return a; } inline float next_float() { state = hash(state); return (state & 0xFFFFFF) * (1.0f / float(1 << 24)); } inline float next() { return (next_float() * 1000.0f - 500.0f); } }; int get_time() { return (int)clock(); } int main(int argc, char* argv[]) { const int NTEST = 100000000; int rand_time = 0; { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v; v.x = random.next(); v.y = random.next(); v.z = random.next(); sum += (v.x + v.y + v.z); } rand_time = get_time() - start_time; printf("random: sum = %f time = %d\n", sum, rand_time); } printf("testing performance...\n"); { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); r = normalize_ref(v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_ref (reference): sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); r = normalize(v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); r = normalize_v1(v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_v1: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); r = normalize_v2(v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_v2 (fast): sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); r = normalize_v3(v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_v3: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len_v1(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len_v1: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len_v2(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len_v2: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len_v3(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len_v3: sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len_v4(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len_v4 (fast): sum = %f time = %d\n", sum, done_time); } { int start_time = get_time(); Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r; v.x = random.next(); v.y = random.next(); v.z = random.next(); normalize_len_v5(r, v); sum += (r.x + r.y + r.z); } int done_time = get_time() - start_time - rand_time; printf("normalize_len_v5: sum = %f time = %d\n", sum, done_time); } printf("testing precision...\n"); { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); r2 = normalize_v1(v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_v1: sum = %f max. error = %.17f\n", sum, max_error); } { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); r2 = normalize_v2(v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_v2 (fast): sum = %f max. error = %.17f\n", sum, max_error); } { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); r2 = normalize_v3(v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_v3: sum = %f max. error = %.17f\n", sum, max_error); } { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); normalize_len_v3(r2, v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_len_v3: sum = %f max. error = %.17f\n", sum, max_error); } { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); normalize_len_v4(r2, v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_len_v4 (fast): sum = %f max. error = %.17f\n", sum, max_error); } { float max_error = 0.0f; Random random; double sum = 0.0; for (int i = 0; i < NTEST; ++i) { Vector v, r1, r2; v.x = random.next(); v.y = random.next(); v.z = random.next(); r1 = normalize_ref(v); normalize_len_v5(r2, v); sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z); max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z))); } printf("normalize_len_v5: sum = %f max. error = %.17f\n", sum, max_error); } return 0; }
输出数据:
random: sum = 296444.360077 time = 2667
testing performance...
normalize_ref (reference): sum = 3368.279413 time = 2528
normalize: sum = 3368.280312 time = 1451
normalize_v1: sum = 3368.280312 time = 1530
normalize_v2 (fast): sum = 3360.893364 time = 671
normalize_v3: sum = 3368.279959 time = 1405
normalize_len: sum = 3368.279241 time = 2465
normalize_len_v1: sum = 3368.279241 time = 2200
normalize_len_v2: sum = 3368.280580 time = 2481
normalize_len_v3: sum = 3368.280580 time = 1717
normalize_len_v4 (fast): sum = 3399.301634 time = 2044
normalize_len_v5: sum = 3368.280429 time = 1514
testing precision...
normalize_v1: sum = 6736.559725 max. error = 0.00000005960464478
normalize_v2 (fast): sum = 6729.172777 max. error = 0.00128239393234253
normalize_v3: sum = 6736.559372 max. error = 0.00000011920928955
normalize_len_v3: sum = 6736.559993 max. error = 0.00000005960464478
normalize_len_v4 (fast): sum = 6767.581047 max. error = 0.01472616195678711
normalize_len_v5: sum = 6736.559842 max. error = 0.00000005960464478
最后结论:
normalize_v3 性价比最高。如果你在3D游戏、或者其它交互性要求很高的场合使用normalize,可以考虑使用这个快速实现,只需要平台支持SSE 2.0即可。如果你对精度要求不高,可以考虑使用normalize_v2 (fast)版本的实现,它使用了Quake 3游戏引擎中的快速开平方根算法。