CPU指令集——内存对齐,bayer抽取r、g、b三通道(含镜像)的字节对齐版本
指令集中的函数有要求内存对齐的版本,如 _mm256_stream_si256
也有不要求内存对齐的版本,如 _mm256_storeu_si256
内存对齐的版本通常效率更高,但使用时传入的地址必须满足指令要求的对齐(128位指令按16字节对齐,256位指令按32字节对齐),否则运行时会触发异常。普通的new不保证这样的对齐,在MSVC下推荐成对使用 _aligned_malloc 和 _aligned_free 来申请和释放空间。
#include <opencv2/opencv.hpp>

#include <cstdio>
#include <malloc.h>   // _aligned_malloc / _aligned_free (MSVC CRT)
#include <memory>
#include <vector>

using namespace cv;

namespace
{
// Deleter so that buffers obtained from _aligned_malloc are always released
// with _aligned_free, on every exit path of the loop body.
struct AlignedFree
{
    void operator()(unsigned char* p) const { _aligned_free(p); }
};
using AlignedPlane = std::unique_ptr<unsigned char, AlignedFree>;

// Allocates `bytes` bytes aligned to 32 bytes; the returned pointer is null on failure.
AlignedPlane allocAlignedPlane(size_t bytes)
{
    return AlignedPlane(static_cast<unsigned char*>(_aligned_malloc(bytes, 32)));
}
}  // namespace

// Demo driver: loads every BMP matching the pattern, crops it to the region the
// converter can handle, splits the Bayer mosaic into half-resolution R/G/B
// planes (mirrored in both directions) and computes |B - R| for each frame.
int main()
{
    // Cropped region passed to the converter. The source frames are 2448 wide,
    // which is not a multiple of 32, so only the first 2432 columns are used
    // (the rightmost 16 columns are dropped).
    const int kRoiWidth = 2432;
    const int kRoiHeight = 512;

    String path = "E:/相机采图/相机2448_512/*.bmp";
    std::vector<String> paths;
    glob(path, paths);

    Mat dst90_0;
    for (const String& file : paths)
    {
        const Mat src = imread(file, 0);  // flag 0: decode as single-channel grayscale
        if (src.empty() || src.cols < kRoiWidth || src.rows < kRoiHeight)
        {
            // Taking the ROI of an empty or too-small image would throw.
            std::fprintf(stderr, "skipping %s: unreadable or smaller than %dx%d\n",
                         file.c_str(), kRoiWidth, kRoiHeight);
            continue;
        }

        // Deep copy is required: the converter walks the buffer with a row
        // stride equal to the width, which a ROI view of `src` does not have.
        const Mat srcValid = src(Rect(0, 0, kRoiWidth, kRoiHeight)).clone();
        const int width = srcValid.cols;
        const int height = srcValid.rows;
        const unsigned char* pBayerRG = srcValid.data;

        // Each output plane holds one sample per 2x2 Bayer cell.
        const size_t planeBytes = static_cast<size_t>(width / 2) * static_cast<size_t>(height / 2);
        const AlignedPlane planeR = allocAlignedPlane(planeBytes);
        const AlignedPlane planeG = allocAlignedPlane(planeBytes);
        const AlignedPlane planeB = allocAlignedPlane(planeBytes);
        if (!planeR || !planeG || !planeB)
        {
            std::fprintf(stderr, "out of memory while processing %s\n", file.c_str());
            return 1;
        }

        const int status = bayer2rgb_CPU(pBayerRG, width, height, BayerFormat::bayerRG, Mirror::mirrorAll,
                                         planeR.get(), planeG.get(), planeB.get());
        if (status != 0)
        {
            std::fprintf(stderr, "bayer2rgb_CPU failed (%d) for %s\n", status, file.c_str());
            continue;
        }

        // These headers only borrow the plane memory; they are declared after
        // the planes so they go out of scope before the planes are freed.
        const Mat src0(height / 2, width / 2, CV_8UC1, planeR.get());
        const Mat src90(height / 2, width / 2, CV_8UC1, planeB.get());
        const Mat srcG(height / 2, width / 2, CV_8UC1, planeG.get());
        absdiff(src90, src0, dst90_0);  // dst90_0 owns its own storage
    }
    return 0;
}
封装的函数(上面 main 中调用的 bayer2rgb_CPU 的实现):
#if defined(_MSC_VER)
#include <intrin.h>     // MSVC: SSE/AVX intrinsics
#else
#include <immintrin.h>  // GCC/Clang: SSE/AVX intrinsics
#endif
#include <stddef.h>     // ptrdiff_t
#include <stdint.h>     // uintptr_t
#include <string.h>     // for memcpy

// Lets GCC/Clang compile the AVX2 helpers below even when the translation unit
// is not built with -mavx2. Expands to nothing on other compilers.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__AVX2__)
#define BAYER2RGB_AVX2 __attribute__((target("avx2")))
#else
#define BAYER2RGB_AVX2
#endif

// Position of the colour filters inside each 2x2 Bayer cell, named by the
// first two filters of the first row.
enum BayerFormat
{
    bayerRG,
    bayerGR,
    bayerBG,
    bayerGB
};

// Mirroring applied to the output planes.
enum Mirror
{
    mirrorNo,   // no mirroring
    mirrorTB,   // flip top/bottom
    mirrorLR,   // flip left/right
    mirrorAll   // flip both
};

// For 16 consecutive 2x2 cells: the sample found at each of the four cell positions.
struct BayerCellPlanes
{
    __m128i topLeft;      // even row, even column
    __m128i topRight;     // even row, odd column
    __m128i bottomLeft;   // odd row, even column
    __m128i bottomRight;  // odd row, odd column
};

// 16 output samples per channel.
struct BayerRgb16
{
    __m128i r;
    __m128i g;
    __m128i b;
};

// Loads 32 source bytes with non-temporal loads. With aligned32 the address
// must be 32-byte aligned; otherwise it must be 16-byte aligned and two
// 16-byte loads are combined.
BAYER2RGB_AVX2 static inline __m256i bayerLoad32(const unsigned char* src, bool aligned32)
{
    // The load intrinsics take a non-const pointer on some compilers.
    unsigned char* p = const_cast<unsigned char*>(src);
    if (aligned32)
    {
        return _mm256_stream_load_si256(reinterpret_cast<__m256i*>(p));
    }
    const __m128i lo = _mm_stream_load_si128(reinterpret_cast<__m128i*>(p));
    const __m128i hi = _mm_stream_load_si128(reinterpret_cast<__m128i*>(p + 16));
    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

// De-interleaves two 32-byte source rows into the four cell-position vectors.
BAYER2RGB_AVX2 static inline BayerCellPlanes bayerSplitCells(__m256i evenRow, __m256i oddRow)
{
    // Within each 128-bit lane: even-indexed bytes first, then odd-indexed bytes.
    const __m256i evensThenOdds = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    // 0xD8 reorders the 64-bit quarters A,B,C,D into A,C,B,D so that all even
    // bytes end up in the low half and all odd bytes in the high half.
    const __m256i top = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(evenRow, evensThenOdds), 0xD8);
    const __m256i bottom = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(oddRow, evensThenOdds), 0xD8);

    BayerCellPlanes cells;
    cells.topLeft = _mm256_extracti128_si256(top, 0);
    cells.topRight = _mm256_extracti128_si256(top, 1);
    cells.bottomLeft = _mm256_extracti128_si256(bottom, 0);
    cells.bottomRight = _mm256_extracti128_si256(bottom, 1);
    return cells;
}

// Maps the cell positions to R/G/B for the given layout. G is the rounded
// average of the two green samples of each cell. `format` must be a valid
// enumerator (the entry point checks this).
BAYER2RGB_AVX2 static inline BayerRgb16 bayerPickChannels(const BayerCellPlanes& cells, BayerFormat format)
{
    BayerRgb16 out;
    switch (format)
    {
    case bayerRG:
        out.r = cells.topLeft;
        out.b = cells.bottomRight;
        out.g = _mm_avg_epu8(cells.topRight, cells.bottomLeft);
        break;
    case bayerGR:
        out.r = cells.topRight;
        out.b = cells.bottomLeft;
        out.g = _mm_avg_epu8(cells.topLeft, cells.bottomRight);
        break;
    case bayerBG:
        out.r = cells.bottomRight;
        out.b = cells.topLeft;
        out.g = _mm_avg_epu8(cells.topRight, cells.bottomLeft);
        break;
    default:  // bayerGB
        out.r = cells.bottomLeft;
        out.b = cells.topRight;
        out.g = _mm_avg_epu8(cells.topLeft, cells.bottomRight);
        break;
    }
    return out;
}

// Reverses the byte order of a 16-byte vector (used for left/right mirroring).
BAYER2RGB_AVX2 static inline __m128i bayerReverse16(__m128i v)
{
    const __m128i reversed = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(v, reversed);
}

// Stores 16 bytes: non-temporal store when the address is known to be 16-byte
// aligned, plain unaligned store otherwise.
BAYER2RGB_AVX2 static inline void bayerStore16(unsigned char* dst, __m128i v, bool aligned16)
{
    if (aligned16)
    {
        _mm_stream_si128(reinterpret_cast<__m128i*>(dst), v);
    }
    else
    {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), v);
    }
}

// Converts the whole image. All arguments have been validated by the caller:
// width is a positive multiple of 16, height a positive multiple of 2, and the
// pointers satisfy the alignment the chosen path needs.
BAYER2RGB_AVX2 static void bayerConvertRows(const unsigned char* src, int width, int height,
                                            BayerFormat format, bool flipRows, bool flipCols,
                                            unsigned char* dstR, unsigned char* dstG, unsigned char* dstB)
{
    const ptrdiff_t srcStride = width;
    const ptrdiff_t dstStride = width / 2;
    const int dstRows = height / 2;
    const int fullGroups = width / 32;          // 32 source bytes -> 16 output samples
    const bool hasTail = (width % 32) != 0;     // 16 leftover source bytes -> 8 output samples
    // Only when every row is a whole number of 32-byte groups are all source
    // addresses 32-byte aligned and all destination addresses 16-byte aligned.
    const bool aligned = !hasTail;

    for (int row = 0; row < dstRows; ++row)
    {
        const unsigned char* evenSrc = src + srcStride * (2 * static_cast<ptrdiff_t>(row));
        const unsigned char* oddSrc = evenSrc + srcStride;
        const ptrdiff_t dstRowOffset = dstStride * (flipRows ? (dstRows - 1 - row) : row);

        for (int group = 0; group < fullGroups; ++group)
        {
            const ptrdiff_t srcCol = 32 * static_cast<ptrdiff_t>(group);
            const BayerCellPlanes cells = bayerSplitCells(bayerLoad32(evenSrc + srcCol, aligned),
                                                          bayerLoad32(oddSrc + srcCol, aligned));
            BayerRgb16 rgb = bayerPickChannels(cells, format);

            ptrdiff_t dstCol = 16 * static_cast<ptrdiff_t>(group);
            if (flipCols)
            {
                rgb.r = bayerReverse16(rgb.r);
                rgb.g = bayerReverse16(rgb.g);
                rgb.b = bayerReverse16(rgb.b);
                dstCol = dstStride - 16 * (static_cast<ptrdiff_t>(group) + 1);
            }

            const ptrdiff_t offset = dstRowOffset + dstCol;
            bayerStore16(dstR + offset, rgb.r, aligned);
            bayerStore16(dstG + offset, rgb.g, aligned);
            bayerStore16(dstB + offset, rgb.b, aligned);
        }

        if (hasTail)
        {
            // Last 16 source bytes of the row pair, zero-extended to 32 bytes so
            // the same de-interleave can be used; only the low 8 bytes of each
            // resulting vector carry data.
            const ptrdiff_t srcCol = 32 * static_cast<ptrdiff_t>(fullGroups);
            unsigned char* evenTail = const_cast<unsigned char*>(evenSrc + srcCol);
            unsigned char* oddTail = const_cast<unsigned char*>(oddSrc + srcCol);
            const __m256i evenRow = _mm256_inserti128_si256(
                _mm256_setzero_si256(), _mm_stream_load_si128(reinterpret_cast<__m128i*>(evenTail)), 0);
            const __m256i oddRow = _mm256_inserti128_si256(
                _mm256_setzero_si256(), _mm_stream_load_si128(reinterpret_cast<__m128i*>(oddTail)), 0);
            BayerRgb16 rgb = bayerPickChannels(bayerSplitCells(evenRow, oddRow), format);

            ptrdiff_t dstCol = 16 * static_cast<ptrdiff_t>(fullGroups);
            if (flipCols)
            {
                // After reversal the 8 valid bytes sit in the high half; move
                // them down. Mirrored, the tail lands at the start of the row.
                rgb.r = _mm_srli_si128(bayerReverse16(rgb.r), 8);
                rgb.g = _mm_srli_si128(bayerReverse16(rgb.g), 8);
                rgb.b = _mm_srli_si128(bayerReverse16(rgb.b), 8);
                dstCol = 0;
            }

            const ptrdiff_t offset = dstRowOffset + dstCol;
            // 8-byte stores without any alignment requirement.
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dstR + offset), rgb.r);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dstG + offset), rgb.g);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dstB + offset), rgb.b);
        }
    }
}

// Splits a packed 8-bit Bayer mosaic into three half-resolution planes
// (one R, one averaged G and one B sample per 2x2 cell), optionally mirrored.
//
// pBayer       : source mosaic, nWidth * nHeight bytes, row stride == nWidth.
// nWidth       : source width; must be a multiple of 16.
// nHeight      : source height; must be a multiple of 2.
// nBayerFormat : layout of the 2x2 cell.
// nMirror      : mirroring applied to the output planes.
// pR, pG, pB   : output planes, each (nWidth / 2) * (nHeight / 2) bytes.
//
// Alignment requirements (this is the aligned-access version):
//   * nWidth % 32 == 0 : pBayer 32-byte aligned; pR, pG, pB 16-byte aligned.
//   * otherwise        : pBayer 16-byte aligned; no requirement on the outputs
//                        (output rows are then not multiples of 16 bytes, so
//                        unaligned stores are used).
//
// Returns 0 on success (including when there is nothing to convert), -1 when
// the dimensions, an enum value, a null pointer or a pointer alignment is not
// acceptable. Nothing is written when -1 is returned. The CPU must support AVX2.
int bayer2rgb_CPU(const unsigned char* pBayer, int nWidth, int nHeight, BayerFormat nBayerFormat, Mirror nMirror,
                  unsigned char* pR, unsigned char* pG, unsigned char* pB)
{
    if (nWidth % 16 != 0 || nHeight % 2 != 0)
    {
        return -1;
    }
    if (nWidth <= 0 || nHeight <= 0)
    {
        return 0;  // nothing to convert
    }

    bool flipRows = false;
    bool flipCols = false;
    switch (nMirror)
    {
    case mirrorNo:
        break;
    case mirrorTB:
        flipRows = true;
        break;
    case mirrorLR:
        flipCols = true;
        break;
    case mirrorAll:
        flipRows = true;
        flipCols = true;
        break;
    default:
        return -1;
    }

    switch (nBayerFormat)
    {
    case bayerRG:
    case bayerGR:
    case bayerBG:
    case bayerGB:
        break;
    default:
        return -1;
    }

    if (pBayer == nullptr || pR == nullptr || pG == nullptr || pB == nullptr)
    {
        return -1;
    }

    // Reject pointers that would fault in the aligned load/store instructions.
    const bool wholeGroups = (nWidth % 32 == 0);
    const uintptr_t srcMask = wholeGroups ? 31u : 15u;
    if ((reinterpret_cast<uintptr_t>(pBayer) & srcMask) != 0)
    {
        return -1;
    }
    if (wholeGroups)
    {
        const uintptr_t dstBits = reinterpret_cast<uintptr_t>(pR) | reinterpret_cast<uintptr_t>(pG) |
                                  reinterpret_cast<uintptr_t>(pB);
        if ((dstBits & 15u) != 0)
        {
            return -1;
        }
    }

    bayerConvertRows(pBayer, nWidth, nHeight, nBayerFormat, flipRows, flipCols, pR, pG, pB);
    return 0;
}