SSE图像优化代码-彩色图像转灰度
#include <chrono>
#include <cstddef>
#include <cstring>
#include <iostream>

// SSE4.1 intrinsics (_mm_cvtepu8_epi16); this header also pulls in SSSE3
// (_mm_shuffle_epi8) and SSE2. GCC/Clang need -msse4.1 (or -march=native).
#include <smmintrin.h>

#include <opencv2/opencv.hpp>

namespace {

// Luma coefficients in 8-bit fixed point (scaled by 256). R_WT is derived from
// the other two so that the three weights sum to exactly 256, which keeps the
// weighted sum of three 8-bit samples within 16 unsigned bits (max 255 * 256).
constexpr int B_WT = static_cast<int>(0.114 * 256 + 0.5);
constexpr int G_WT = static_cast<int>(0.587 * 256 + 0.5);
constexpr int R_WT = 256 - B_WT - G_WT;  // int(0.299 * 256 + 0.5)

// Converts one 3-byte pixel to gray. Byte 0 is weighted with the blue
// coefficient, byte 1 with green and byte 2 with red (OpenCV's BGR order).
inline unsigned char PixelToGray(const unsigned char* px) {
    return static_cast<unsigned char>((B_WT * px[0] + G_WT * px[1] + R_WT * px[2]) >> 8);
}

// Scalar conversion of `count` consecutive pixels; used for whole rows in the
// reference version and for row tails in the unrolled / SSE versions.
inline void ConvertPixelsScalar(const unsigned char* src, unsigned char* dst, int count) {
    for (int i = 0; i < count; ++i, src += 3) {
        dst[i] = PixelToGray(src);
    }
}

// Reads exactly 8 bytes and zero-extends them to eight 16-bit lanes.
// A 64-bit load is used on purpose: a 128-bit load would touch 8 bytes that
// are never used and may lie beyond the end of the source buffer.
inline __m128i LoadWiden8(const unsigned char* p) {
    return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p)));
}

// Lane i of the result is (p[i]*w0[i] + p[i+1]*w1[i] + p[i+2]*w2[i]) >> 8.
// Only lanes whose byte triple starts on a pixel boundary hold a gray value;
// the caller picks those lanes with a byte shuffle. Reads p[0..9].
inline __m128i ScaledWeightedSum(const unsigned char* p, __m128i w0, __m128i w1, __m128i w2) {
    const __m128i t0 = _mm_mullo_epi16(LoadWiden8(p + 0), w0);
    const __m128i t1 = _mm_mullo_epi16(LoadWiden8(p + 1), w1);
    const __m128i t2 = _mm_mullo_epi16(LoadWiden8(p + 2), w2);
    // 16-bit wrap-around arithmetic is exact here because the true sum is < 65536;
    // the shift must be logical since the sum may exceed 32767.
    return _mm_srli_epi16(_mm_add_epi16(t2, _mm_add_epi16(t0, t1)), 8);
}

// Writes only the low 12 bytes of `v`, so nothing beyond the 12 converted
// pixels is modified.
inline void Store12(unsigned char* dst, __m128i v) {
    _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), v);
    const int upper = _mm_cvtsi128_si32(_mm_srli_si128(v, 8));
    std::memcpy(dst + 8, &upper, sizeof(upper));
}

}  // namespace

// Reference implementation: converts a 3-bytes-per-pixel image to 8-bit gray.
//
// Src    - first byte of the source image (3 interleaved bytes per pixel).
// Dest   - output buffer, tightly packed: row Y starts at Dest + Y * Width.
// Width  - pixels per row.
// Height - number of rows.
// Stride - distance in bytes between the starts of two consecutive source rows.
void RGB2Y(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride) {
    for (int Y = 0; Y < Height; Y++) {
        const unsigned char* LinePS = Src + static_cast<std::ptrdiff_t>(Y) * Stride;
        unsigned char* LinePD = Dest + static_cast<std::ptrdiff_t>(Y) * Width;
        ConvertPixelsScalar(LinePS, LinePD, Width);
    }
}

// Same contract as RGB2Y, with the inner loop manually unrolled four times.
// Pixels left over after the unrolled groups go through the scalar tail.
void RGB2Y1(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride) {
    for (int Y = 0; Y < Height; Y++) {
        const unsigned char* LinePS = Src + static_cast<std::ptrdiff_t>(Y) * Stride;
        unsigned char* LinePD = Dest + static_cast<std::ptrdiff_t>(Y) * Width;
        int X = 0;
        for (; X + 4 <= Width; X += 4, LinePS += 12) {
            LinePD[X + 0] = PixelToGray(LinePS + 0);
            LinePD[X + 1] = PixelToGray(LinePS + 3);
            LinePD[X + 2] = PixelToGray(LinePS + 6);
            LinePD[X + 3] = PixelToGray(LinePS + 9);
        }
        ConvertPixelsScalar(LinePS, LinePD + X, Width - X);
    }
}

// Same contract as RGB2Y, processing 12 pixels (36 source bytes) per iteration
// with SSE. Each group is covered by four 8-lane windows starting at byte
// offsets 0, 8, 18 and 26; every window yields three valid gray values.
// Rows whose width is not a multiple of 12 are finished by the scalar tail,
// and no byte outside the source row or the 12 output pixels is touched.
void RGB2Y2(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride) {
    // The three rotations of the per-byte weight pattern. Which rotation goes
    // with which load depends on where the window starts relative to a pixel.
    const __m128i wBGR = _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT);
    const __m128i wGRB = _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT);
    const __m128i wRBG = _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT);

    // Shuffle masks: move the low byte of each valid 16-bit lane to its output
    // position and zero everything else (-1 selects zero).
    // Windows at offsets 0 / 18 are pixel-aligned: valid lanes 0, 3, 6.
    // Windows at offsets 8 / 26 start on a pixel's third byte: valid lanes 1, 4, 7.
    const __m128i pickAL = _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickAH = _mm_setr_epi8(-1, -1, -1, 2, 8, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickBL = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickBH = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 8, 14, -1, -1, -1, -1);

    for (int Y = 0; Y < Height; Y++) {
        const unsigned char* LinePS = Src + static_cast<std::ptrdiff_t>(Y) * Stride;
        unsigned char* LinePD = Dest + static_cast<std::ptrdiff_t>(Y) * Width;
        int X = 0;
        for (; X + 12 <= Width; X += 12, LinePS += 36) {
            const __m128i aL = ScaledWeightedSum(LinePS + 0, wBGR, wGRB, wRBG);   // pixels 0-2
            const __m128i aH = ScaledWeightedSum(LinePS + 8, wRBG, wBGR, wGRB);   // pixels 3-5
            const __m128i bL = ScaledWeightedSum(LinePS + 18, wBGR, wGRB, wRBG);  // pixels 6-8
            const __m128i bH = ScaledWeightedSum(LinePS + 26, wRBG, wBGR, wGRB);  // pixels 9-11

            const __m128i low = _mm_or_si128(_mm_shuffle_epi8(aL, pickAL), _mm_shuffle_epi8(bL, pickBL));
            const __m128i high = _mm_or_si128(_mm_shuffle_epi8(aH, pickAH), _mm_shuffle_epi8(bH, pickBH));
            Store12(LinePD + X, _mm_or_si128(low, high));
        }
        // Remaining (Width % 12) pixels of this row.
        ConvertPixelsScalar(LinePS, LinePD + X, Width - X);
    }
}

// Loads a test image, resizes it to 1920x1280 and reports the average time of
// one grayscale conversion. Swap RGB2Y1 for RGB2Y or RGB2Y2 to compare variants.
int main() {
    cv::Mat img = cv::imread("C:\\Users\\Administrator\\Desktop\\test.png");
    if (img.empty()) {
        std::cerr << "failed to load image" << '\n';
        return 1;
    }
    cv::resize(img, img, cv::Size{ 1920, 1280 });
    cv::Mat gray(img.rows, img.cols, CV_8UC1);

    const int Cnt = 100;
    // Width is the column count, height the row count, and the source stride is
    // the Mat's real row pitch in bytes.
    const int Stride = static_cast<int>(img.step[0]);

    const auto start = std::chrono::steady_clock::now();
    for (int m = 0; m < Cnt; ++m) {
        RGB2Y1(img.data, gray.data, img.cols, img.rows, Stride);
    }
    const auto end = std::chrono::steady_clock::now();

    const double avgMs = std::chrono::duration<double, std::milli>(end - start).count() / Cnt;
    std::cout << "time cost:" << avgMs << "ms" << '\n';
    return 0;
}
ref: https://blog.csdn.net/qq_48034474/article/details/123404894 https://cloud.tencent.com/developer/article/1011903
实测PC端SSE的加速效果不太好,速度慢了近一倍,不知道是哪里的原因。