求点集平均值的指令集加速例子
最近在学指令集加速,比较好奇求平均值能不能用指令集加速,于是做了这么一个测试。先说一下结论:不要看普通C++代码循环内部调用了operator+=()函数就觉得慢,其实在Release版优化的情况下,它的效率和使用指令集加速的差异在2%左右。所以像求平均值这种简单的运算不需要手工调用指令集优化,编译器可以做得很好。下述代码基于VS2017、Qt 5.9和OpenCV 4.3.0,CPU型号是Intel Core i5-7400。下面是代码:
static double sset = 0; static double nonet = 0; static double opencvt = 0; void main() { vector<Point2f> points1(10000); vector<Point2f> points2(10000); vector<Point2f> points3(10000); randu(points1, 0, 10); randu(points2, 0, 10); randu(points3, 0, 10); int64 t1, t2; Point2f avr; for (int x = 0; x < 100000; x++) { t1 = getTickCount(); for (auto& item : points1) { avr += item; } avr /= 10000.0f; t2 = getTickCount(); nonet += (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m128 su128 = _mm_setzero_ps(); for (int i = 0; i < 10000; i += 2) { __m128 mmx = _mm_loadu_ps((float*)&points2[i]); su128 = _mm_add_ps(su128, mmx); } float e1, e2; _MM_EXTRACT_FLOAT(e1, su128, 0); _MM_EXTRACT_FLOAT(e2, su128, 2); avr.x = (e1 + e2) / 10000; _MM_EXTRACT_FLOAT(e1, su128, 1); _MM_EXTRACT_FLOAT(e2, su128, 3); avr.y = (e1 + e2) / 10000; t2 = getTickCount(); sset += (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m256 su256 = _mm256_setzero_ps(); for (int i = 0; i < 10000; i += 4) { __m256 mmx = _mm256_loadu_ps((float*)&points3[i]); su256 = _mm256_add_ps(su256, mmx); } __m128 mml = _mm256_extractf128_ps(su256, 0); __m128 mmh = _mm256_extractf128_ps(su256, 1); __m128 res = _mm_add_ps(mml, mmh); _MM_EXTRACT_FLOAT(e1, res, 0); _MM_EXTRACT_FLOAT(e2, res, 2); avr.x = (e1 + e2) / 10000; _MM_EXTRACT_FLOAT(e1, res, 1); _MM_EXTRACT_FLOAT(e2, res, 3); avr.y = (e1 + e2) / 10000; t2 = getTickCount(); opencvt += (t2 - t1) / getTickFrequency() * 1000; } qDebug() << "NONE(ms):" << nonet; qDebug() << "SSE(ms):" << sset; qDebug() << "AVX(ms):" << opencvt; }
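下面是运行几次的输出。可以看出指令集加速毫无作用甚至适得其反。需要注意的是:这些数值是10万轮的累计耗时,却只有约1.7毫秒,远小于真正完成约10亿次点累加所需的时间,因此这组数据很可能主要反映的是计时函数本身的开销(求和循环可能已被编译器优化掉),结论仅供参考: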
NONE(ms): 1.6826  SSE(ms): 1.6635  AVX(ms): 1.6978
NONE(ms): 1.7824  SSE(ms): 2.0012  AVX(ms): 1.8098
NONE(ms): 1.6637  SSE(ms): 1.817  AVX(ms): 1.7181
NONE(ms): 1.7506  SSE(ms): 2.2308  AVX(ms): 1.7104
NONE(ms): 1.7499  SSE(ms): 2.8038  AVX(ms): 1.783