图像二值化的指令集加速例子(一)
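以下代码基于 VS2017、Qt 5.9 和 OpenCV 4.3.0,CPU 型号是 Intel Core i5-7400,功能是对一幅 8 位单通道图像进行二值化(阈值 127,输出 0 或 255)。代码中的 SSE 版本用到了 SSSE3/SSE4.1 指令,AVX 版本用到了 AVX2 指令,运行前请确认 CPU 支持这些指令集。下面直接上代码: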
void main() { Mat image(1024, 1024, CV_8UC1, Scalar(255)); circle(image, Point2i(500, 500), 200, Scalar(0), -1); int64 t1, t2; Mat binar1(image.size(), image.type()); Mat binar2(image.size(), image.type()); // 确保32字节对齐 ASSERT(int64(image.data) % 32 == 0); ASSERT(int64(binar1.data) % 32 == 0); ASSERT(int64(binar2.data) % 32 == 0); t1 = getTickCount(); threshold(image, binar1, 127, 255, THRESH_BINARY); t2 = getTickCount(); qDebug() << u8"OPENCV(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j++) { dest[j] = line[j] > 127 ? 255 : 0; } } t2 = getTickCount(); qDebug() << u8"NONE(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m128i m128t = _mm_set_epi16(127, 127, 127, 127, 127, 127, 127, 127); __m128i m128h = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j += 8) { __m128i mmx08 = _mm_set_epi64x(0, *(int64*)&line[j]); __m128i mmx16 = _mm_cvtepu8_epi16(mmx08); __m128i res = _mm_cmplt_epi16(m128t, mmx16); __m128i half = _mm_shuffle_epi8(res, m128h); *(int64*)&dest[j] = _mm_extract_epi64(half, 0); } } t2 = getTickCount(); qDebug() << u8"SSE(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m256i m256t = _mm256_set1_epi16(127); __m256i m256h = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j += 16) { __m128i mmx08 = _mm_set_epi64x(*(int64*)&line[j + 8], *(int64*)&line[j]); __m256i mmx16 = _mm256_cvtepu8_epi16(mmx08); __m256i res = _mm256_cmpgt_epi16(mmx16, m256t); __m256i half = 
_mm256_shuffle_epi8(res, m256h); *(int64*)&dest[j] = _mm256_extract_epi64(half, 0); *(int64*)&dest[j + 8] = _mm256_extract_epi64(half, 2); } } t2 = getTickCount(); qDebug() << u8"AVX(ms):" << (t2 - t1) / getTickFrequency() * 1000; }
在 Release 模式下连续执行 50 次的输出如下(单位:毫秒,数值越小越快)。从这一批输出可以看出,AVX 版本的耗时在大多数情况下都低于 OpenCV 的 threshold:
OPENCV(ms): 2.0732
NONE(ms): 0.7314
SSE(ms): 0.2543
AVX(ms): 0.2199
OPENCV(ms): 0.4455
NONE(ms): 0.7666
SSE(ms): 0.293
AVX(ms): 0.179
OPENCV(ms): 0.5774
NONE(ms): 2.3402
SSE(ms): 0.2871
AVX(ms): 0.2766
OPENCV(ms): 0.3737
NONE(ms): 0.7787
SSE(ms): 0.3047
AVX(ms): 0.3284
OPENCV(ms): 0.3145
NONE(ms): 0.7349
SSE(ms): 0.3549
AVX(ms): 0.3025
OPENCV(ms): 0.4318
NONE(ms): 0.7679
SSE(ms): 2.4315
AVX(ms): 0.2681
OPENCV(ms): 0.3959
NONE(ms): 0.9343
SSE(ms): 0.3756
AVX(ms): 0.439
OPENCV(ms): 0.3512
NONE(ms): 2.4505
SSE(ms): 0.377
AVX(ms): 0.2237
OPENCV(ms): 0.5284
NONE(ms): 0.7935
SSE(ms): 0.4699
AVX(ms): 0.2633
OPENCV(ms): 0.4671
NONE(ms): 0.8124
SSE(ms): 0.2919
AVX(ms): 0.2929
...<输出太多删除一部分>
OPENCV(ms): 0.6298
NONE(ms): 1.6238
SSE(ms): 0.4122
AVX(ms): 0.2643
OPENCV(ms): 0.8655
NONE(ms): 1.0023
SSE(ms): 0.3301
AVX(ms): 0.3396
OPENCV(ms): 0.6918
NONE(ms): 0.8999
SSE(ms): 0.2622
AVX(ms): 0.1829
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)