CPU指令集——bayer抽取r、g、b三通道
需求:在高帧率场景下,相机输出的通常是bayer格式数据。图像处理时,一般会先插值成rgb,再拆分为单通道。如果可以直接从bayer数据中抽出r、g、b三个通道,效率将大大提升。
注意:抽取后r、g、b的宽和高各为原图的一半(像素数为原图的1/4),没有做插值(插值只会让数据量变大,并不会引入新的有效信息)。其中g通道由每个2×2单元内的两个g取均值得到。
效果:CPU指令集优化后,速度是传统算法的8倍左右。
应用举例:
#include <opencv.hpp>
#include <Windows.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

/**
 * Demo / micro-benchmark: split a BayerRG frame into half-resolution R, G, B planes.
 *
 * Loads "1.bmp" as a single-channel image, runs the extraction 100 times and
 * prints the elapsed wall-clock time in milliseconds. Flip `useSimdPath` to
 * time bayer2rgb_CPU instead of the plain scalar reference loop.
 *
 * NOTE: bayer2rgb_CPU and BayerFormat must be declared before this function
 * for the snippet to compile.
 *
 * @return 0 on success, 1 when the input image cannot be loaded.
 */
int main()
{
    // Flag 0 loads the file as a single-channel image; the pixels are the raw BayerRG mosaic.
    cv::Mat img_bayerRG = cv::imread("1.bmp", 0);
    if (img_bayerRG.empty()) {
        // Without this check a missing file would leave `data` null and crash below.
        fprintf(stderr, "failed to load 1.bmp\n");
        return 1;
    }

    const uint8_t* bayer = img_bayerRG.data;
    const int height = img_bayerRG.rows;
    const int width = img_bayerRG.cols;

    // Every 2x2 Bayer cell yields one sample per plane, so each plane is (width/2) x (height/2).
    const int outRows = height / 2;
    const int outCols = width / 2;
    const std::size_t planeSize = static_cast<std::size_t>(outRows) * static_cast<std::size_t>(outCols);
    std::vector<uint8_t> rPlane(planeSize);
    std::vector<uint8_t> gPlane(planeSize); // the two greens of a cell are averaged into one sample
    std::vector<uint8_t> bPlane(planeSize);
    uint8_t* r = rPlane.data();
    uint8_t* g = gPlane.data();
    uint8_t* b = bPlane.data();

    const bool useSimdPath = false; // false: time the scalar reference; true: time bayer2rgb_CPU

    LARGE_INTEGER nEndTime, nBeginTime, nFreq;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nBeginTime); // counter value at the start of the measurement

    for (int iteration = 0; iteration < 100; iteration++) {
        if (useSimdPath) {
            bayer2rgb_CPU(bayer, width, height, BayerFormat::bayerRG, r, g, b);
        } else {
            // Scalar reference. Iterating over complete 2x2 cells only keeps the reads
            // inside the image even when width or height is odd.
            for (int row = 0; row < outRows; ++row) {
                const uint8_t* top = bayer + static_cast<std::size_t>(width) * (2 * static_cast<std::size_t>(row));
                const uint8_t* bottom = top + width;
                const std::size_t rowBase = static_cast<std::size_t>(row) * static_cast<std::size_t>(outCols);
                for (int col = 0; col < outCols; ++col) {
                    const std::size_t index = rowBase + static_cast<std::size_t>(col);
                    r[index] = top[2 * col];
                    b[index] = bottom[2 * col + 1];
                    g[index] = static_cast<uint8_t>((top[2 * col + 1] + bottom[2 * col]) / 2);
                }
            }
        }
    }

    QueryPerformanceCounter(&nEndTime); // counter value at the end of the measurement
    // (end - begin) / frequency is seconds; scaled by 1000 to report milliseconds.
    const double time = (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
    printf("100次bayer2rgb耗时(ms): %f \n\n", time);

    // Wrap the planes as cv::Mat headers without copying; the vectors above own
    // the memory and outlive these headers.
    cv::Mat R = cv::Mat(outRows, outCols, CV_8U, r);
    cv::Mat B = cv::Mat(outRows, outCols, CV_8U, b);
    cv::Mat G = cv::Mat(outRows, outCols, CV_8U, g);
    cv::waitKey(100000);

    return 0;
}
函数封装:
// Use the AVX2 kernel when the compiler targets AVX2. MSVC exposes the
// intrinsics regardless of /arch, so the SIMD path stays enabled there as it
// was originally. Any other configuration falls back to the scalar loop.
#if defined(__AVX2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
#define BAYER2RGB_USE_AVX2 1
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#else
#define BAYER2RGB_USE_AVX2 0
#endif
#include <cstddef>

/** Position of the colour filters inside each 2x2 Bayer cell (top-left, top-right order). */
enum BayerFormat
{
    bayerRG,
    bayerGR,
    bayerBG,
    bayerGB
};

/**
 * Split a Bayer mosaic into three half-resolution planes without interpolation.
 *
 * Each 2x2 cell contributes one R sample, one B sample and one G sample; the G
 * sample is the rounded-up average ((a + b + 1) >> 1) of the cell's two greens,
 * which is what _mm_avg_epu8 computes, so the SIMD and scalar paths agree bit
 * for bit.
 *
 * The input is read as tightly packed rows of nWidth bytes. Each output plane
 * is written as (nHeight / 2) rows of (nWidth / 2) bytes. A trailing odd
 * column or row is ignored. No alignment is required for any pointer, and
 * nWidth no longer has to be a multiple of 32: full 32-byte groups go through
 * the AVX2 kernel and the remainder of each row is handled by scalar code.
 *
 * @param pBayer       Source mosaic, nWidth * nHeight bytes.
 * @param nWidth       Mosaic width in pixels.
 * @param nHeight      Mosaic height in pixels.
 * @param nBayerFormat One of the BayerFormat values; any other value leaves
 *                     the output planes untouched.
 * @param pR           Destination red plane, (nWidth / 2) * (nHeight / 2) bytes.
 * @param pG           Destination green plane, same size.
 * @param pB           Destination blue plane, same size.
 */
void bayer2rgb_CPU(const unsigned char* pBayer, int nWidth, int nHeight, int nBayerFormat,
                   unsigned char* pR, unsigned char* pG, unsigned char* pB)
{
    if (pBayer == nullptr || pR == nullptr || pG == nullptr || pB == nullptr) {
        return;
    }
    if (nWidth < 2 || nHeight < 2) {
        return;
    }

    // Decode the layout once instead of switching on it for every block.
    //   chromaOnOddColumn: the top row's R/B sample sits in the odd column
    //                      (GR / GB); otherwise in the even column (RG / BG).
    //   pTopChroma / pBottomChroma: planes receiving the non-green sample of
    //                      the top / bottom row of each cell.
    bool chromaOnOddColumn = false;
    unsigned char* pTopChroma = nullptr;
    unsigned char* pBottomChroma = nullptr;
    switch (nBayerFormat) {
    case bayerRG:
        chromaOnOddColumn = false;
        pTopChroma = pR;
        pBottomChroma = pB;
        break;
    case bayerGR:
        chromaOnOddColumn = true;
        pTopChroma = pR;
        pBottomChroma = pB;
        break;
    case bayerBG:
        chromaOnOddColumn = false;
        pTopChroma = pB;
        pBottomChroma = pR;
        break;
    case bayerGB:
        chromaOnOddColumn = true;
        pTopChroma = pB;
        pBottomChroma = pR;
        break;
    default:
        return; // unknown layout: write nothing
    }

    // size_t arithmetic so large frames cannot overflow the int offsets.
    const std::size_t inStride = static_cast<std::size_t>(nWidth);
    const std::size_t outCols = inStride / 2;
    const std::size_t outRows = static_cast<std::size_t>(nHeight) / 2;
    const std::size_t topChromaCol = chromaOnOddColumn ? 1 : 0;
    const std::size_t topGreenCol = 1 - topChromaCol;

#if BAYER2RGB_USE_AVX2
    // Per 128-bit lane: gather the even-indexed bytes into the low 8 bytes and
    // the odd-indexed bytes into the high 8 bytes.
    const __m256i splitEvenOdd = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
#endif

    for (std::size_t row = 0; row < outRows; ++row) {
        const unsigned char* top = pBayer + inStride * (2 * row);
        const unsigned char* bottom = top + inStride;
        unsigned char* dstTopChroma = pTopChroma + outCols * row;
        unsigned char* dstBottomChroma = pBottomChroma + outCols * row;
        unsigned char* dstGreen = pG + outCols * row;

        std::size_t col = 0;

#if BAYER2RGB_USE_AVX2
        // 32 input bytes per row -> 16 output bytes per plane.
        for (; col + 16 <= outCols; col += 16) {
            // Unaligned loads: the caller's buffer is not guaranteed to be 32-byte aligned.
            const __m256i topRaw = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(top + 2 * col));
            const __m256i bottomRaw = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bottom + 2 * col));

            // After the in-lane shuffle the 64-bit quarters are [even0, odd0, even1, odd1];
            // permute 0xD8 (order 0,2,1,3) makes them [even0, even1, odd0, odd1].
            const __m256i topSplit = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(topRaw, splitEvenOdd), 0xD8);
            const __m256i bottomSplit = _mm256_permute4x64_epi64(_mm256_shuffle_epi8(bottomRaw, splitEvenOdd), 0xD8);

            const __m128i topEven = _mm256_castsi256_si128(topSplit);
            const __m128i topOdd = _mm256_extracti128_si256(topSplit, 1);
            const __m128i bottomEven = _mm256_castsi256_si128(bottomSplit);
            const __m128i bottomOdd = _mm256_extracti128_si256(bottomSplit, 1);

            const __m128i topChroma = chromaOnOddColumn ? topOdd : topEven;
            const __m128i topGreen = chromaOnOddColumn ? topEven : topOdd;
            const __m128i bottomChroma = chromaOnOddColumn ? bottomEven : bottomOdd;
            const __m128i bottomGreen = chromaOnOddColumn ? bottomOdd : bottomEven;

            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstTopChroma + col), topChroma);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstBottomChroma + col), bottomChroma);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dstGreen + col), _mm_avg_epu8(topGreen, bottomGreen));
        }
#endif

        // Scalar tail (or the whole row when AVX2 is unavailable).
        for (; col < outCols; ++col) {
            const unsigned char* cellTop = top + 2 * col;
            const unsigned char* cellBottom = bottom + 2 * col;
            dstTopChroma[col] = cellTop[topChromaCol];
            dstBottomChroma[col] = cellBottom[topGreenCol];
            // The bottom row mirrors the top row: its green sits under the top chroma.
            dstGreen[col] = static_cast<unsigned char>(
                (static_cast<unsigned>(cellTop[topGreenCol]) + static_cast<unsigned>(cellBottom[topChromaCol]) + 1u) >> 1);
        }
    }
}