测试结果:
sum (fast) in clock 1562
sum (fast2) in clock 1407
sum (fast3) in clock 3156
sum in clock 7797
Error is 1.512115
Error2 is 0.030914
Error3 is 0.001389
#include <stdio.h> #include <xmmintrin.h> #define NOMINMAX #include <windows.h> #include <math.h> #include <time.h> /* * (c) Ian Stephenson * * ian@dctsystems.co.uk * * Fast pow() reference implementation */ /* * http://www.dctsystems.co.uk/Software/power.html * http://www.dctsystems.co.uk/Software/power.c */ const float shift23=(1<<23); const float OOshift23=1.0/(1<<23); __forceinline float myFloorf(float a) { return (float)((int)a - (a < 0.0f)); } __forceinline float myLog2(float i) { float LogBodge=0.346607f; float x; float y; x=(float)(*(int *)&i); x*= OOshift23; //1/pow(2,23); x=x-127; y=x-myFloorf(x); y=(y-y*y)*LogBodge; return x+y; } __forceinline float myPow2(float i) { float PowBodge=0.33971f; float x; float y=i-myFloorf(i); y=(y-y*y)*PowBodge; x=i+127-y; x*= shift23; //pow(2,23); *(int*)&x=(int)x; return x; } __forceinline float myPow(float a, float b) { return myPow2(b*myLog2(a)); } /////////////////////////////////////// /* Code below are from http://code.google.com/p/fastapprox/ */ __forceinline float fastpow2(float p) { float offset = (p < 0) ? 1.0f : 0.0f; float clipp = (p < -126) ? -126.0f : p; int w = (int)clipp; float z = clipp - w + offset; union { unsigned int i; float f; } v = { (unsigned int)((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z)) }; return v.f; } __forceinline float fastlog2(float x) { union { float f; unsigned int i; } vx = { x }; union { unsigned int i; float f; } mx = { (vx.i & 0x007FFFFF) | 0x3f000000 }; float y = (float)vx.i; y *= 1.1920928955078125e-7f; return y - 124.22551499f - 1.498030302f * mx.f - 1.72587999f / (0.3520887068f + mx.f); } __forceinline float fastpow(float x, float p) { return fastpow2(p * fastlog2(x)); } ///////////////////////////////////////////////// #define FLT_MIN 1.175494351e-38F #define FLT_MAX 3.402823466e+38F template <typename T> __forceinline T min(T a, T b) { return ((a < b) ? a : b); } __forceinline float fast_fabs(float x) { union { float f; unsigned int i; } v = {x}; v.i &= 0x7FFFFFFF; return v.f; } /// Multiply and add: (a * b) + c template <typename T> __forceinline T madd (const T& a, const T& b, const T& c) { // NOTE: in the future we may want to explicitly ask for a fused // multiply-add in a specialized version for float. // NOTE2: GCC/ICC will turn this (for float) into a FMA unless // explicitly asked not to, clang seems to leave the code alone. return a * b + c; } template <typename IN_TYPE, typename OUT_TYPE> __forceinline OUT_TYPE bit_cast (const IN_TYPE in) { union { IN_TYPE in_val; OUT_TYPE out_val; } cvt; cvt.in_val = in; return cvt.out_val; } __forceinline float fast_log2 (float x) { // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans if (x < FLT_MIN) x = FLT_MIN; if (x > FLT_MAX) x = FLT_MAX; // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h unsigned bits = bit_cast<float, unsigned>(x); int exponent = int(bits >> 23) - 127; float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f; // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error // ulp histogram: // 0 = 97.46% // 1 = 2.29% // 2 = 0.11% float f2 = f * f; float f4 = f2 * f2; float hi = madd(f, -0.00931049621349f, 0.05206469089414f); float lo = madd(f, 0.47868480909345f, -0.72116591947498f); hi = madd(f, hi, -0.13753123777116f); hi = madd(f, hi, 0.24187369696082f); hi = madd(f, hi, -0.34730547155299f); lo = madd(f, lo, 1.442689881667200f); return ((f4 * hi) + (f * lo)) + exponent; } __forceinline float fast_exp2 (float x) { // clamp to safe range for final addition if (x < -126.0f) x = -126.0f; if (x > 126.0f) x = 126.0f; // range reduction int m = int(x); x -= m; x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!) // 5th degree polynomial generated with sollya // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp // ulp histogram: // 0 = 87.81% // 1 = 4.18% float r = 1.33336498402e-3f; r = madd(x, r, 9.810352697968e-3f); r = madd(x, r, 5.551834031939e-2f); r = madd(x, r, 0.2401793301105f); r = madd(x, r, 0.693144857883f); r = madd(x, r, 1.0f); // multiply by 2 ^ m by adding in the exponent // NOTE: left-shift of negative number is undefined behavior return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23)); } __forceinline float fast_safe_pow (float x, float y) { if (y == 0) return 1.0f; // x^0=1 if (x == 0) return 0.0f; // 0^y=0 // be cheap & exact for special case of squaring and identity if (y == 1.0f) return x; if (y == 2.0f) return min (x*x, FLT_MAX); float sign = 1.0f; if (x < 0) { // if x is negative, only deal with integer powers // powf returns NaN for non-integers, we will return 0 instead int ybits = bit_cast<float, int>(y) & 0x7fffffff; if (ybits >= 0x4b800000) { // always even int, keep positive } else if (ybits >= 0x3f800000) { // bigger than 1, check int k = (ybits >> 23) - 127; // get exponent int j = ybits >> (23 - k); // shift out possible fractional bits if ((j << (23 - k)) == ybits) // rebuild number and check for a match sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd else return 0.0f; // not integer } else { return 0.0f; // not integer } } return sign * fast_exp2(y * fast_log2(fast_fabs(x))); } ///////// int main(int argc, char *argv[]) { const int N = 100000000; float *buf = new float[N]; float *a = new float[N]; float *b = new float[N]; float *c = new float[N]; float *d = new float[N]; for (int i = 0; i < N; ++i) { buf[i] = 1000.0f * (float)rand() / (float)RAND_MAX; } int start_time; start_time = clock(); for (int i = 0; i < N; ++i) { a[i] = myPow(buf[i], 0.8f); } printf("sum (fast) in clock %d\n", clock() - start_time); start_time = clock(); for (int i = 0; i < N; ++i) { c[i] = fastpow(buf[i], 0.8f); } printf("sum (fast2) in clock %d\n", clock() - start_time); start_time = clock(); for (int i = 0; i < N; ++i) { d[i] = fast_safe_pow(buf[i], 0.8f); } printf("sum (fast3) in clock %d\n", clock() - start_time); start_time = clock(); for (int i = 0; i < N; ++i) { b[i] = powf(buf[i], 0.8f); } printf("sum in clock %d\n", clock() - start_time); float max_err = 0.0f; for (int i = 0; i < N; ++i) { float err = fabsf(a[i] - b[i]); if (err > max_err) max_err = err; } printf("Error is %f\n", max_err); max_err = 0.0f; for (int i = 0; i < N; ++i) { float err = fabsf(b[i] - c[i]); if (err > max_err) max_err = err; } printf("Error2 is %f\n", max_err); max_err = 0.0f; for (int i = 0; i < N; ++i) { float err = fabsf(b[i] - d[i]); if (err > max_err) max_err = err; } printf("Error3 is %f\n", max_err); delete[]buf; delete[]a; delete[]b; delete[]c; delete[]d; return 0; }