图像梯度计算加速
常规方法:
1 void SobelAmplitude(Mat &sobelx, Mat &sobely, Mat &SobelXY) { 2 SobelXY = Mat::zeros(sobelx.size(), CV_32FC1); 3 for (int i = 0; i < SobelXY.rows; i++) { 4 float *data = SobelXY.ptr<float>(i); 5 short *datax = sobelx.ptr<short>(i); 6 short *datay = sobely.ptr<short>(i); 7 for (int j = 0; j < SobelXY.cols; j++) { 8 data[j] = sqrt(datax[j] * datax[j] + datay[j] * datay[j]); 9 } 10 } 11 // convertScaleAbs(SobelXY, SobelXY); 12 }
上述,图像在x和y方向上的梯度数据类型为CV_16S,图像若是uchar类型,可以用其他方法比如:
查表法:提供者:GiantPandaCV
unsigned char Table[65026]; for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f); for (int X = 0; X < Width; X++) { int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; }
SSE优化一下short类型图像:
void simd_Sobel(Mat &sobelx, Mat &sobely, Mat &SobelXY) { SobelXY = Mat::zeros(sobelx.size(), CV_32FC1); __m128i X, Y; __m128i XY; __m128 floats1, floats2; for (int i = 0; i < SobelXY.rows; i++) { float *data = SobelXY.ptr<float>(i); short *datax = sobelx.ptr<short>(i); short *datay = sobely.ptr<short>(i); int j; for (j = 0; j +4< SobelXY.cols+1; j = j + 4) { X = _mm_loadu_si128((__m128i*)(datax + j)); X = _mm_cvtepi16_epi32(X); //16->32 整形。 floats1 = _mm_cvtepi32_ps(X); //整型-> floats,得到dataX Y = _mm_loadu_si128((__m128i*)(datay + j)); Y = _mm_cvtepi16_epi32(Y); floats2 = _mm_cvtepi32_ps(Y); //得到dataY floats1 = _mm_mul_ps(floats1, floats1); floats2 = _mm_mul_ps(floats2, floats2); floats1 = _mm_add_ps(floats1, floats2); floats1 = _mm_sqrt_ps(floats1); _mm_storeu_ps(data + j, floats1); } } }
SEE优化,一次性处理8个数据:
因为输入的是short类型,一个short类型是16位,在上一版中为了使用_mm_add_ps和_mm_storeu_ps,
_mm_add_ps,可查表知道:
__m128 _mm_add_ps (__m128 a, __m128 b),
输入必须是__m128,也就是说必须要4个float类型,折腾了半天就为了把16位short数据变为32位float数据,把__m128i类型转化为__m128。
其实这样很不爽的,因为一个__m128i明明可以存放8个short数据,可为了转化为float数据,我们只放了4个short。
接下来优化的就是如何一次性在__m128中放8个short。
void simd_Sobel8(Mat &sobelx, Mat &sobely, Mat &SobelXY) { SobelXY = Mat::zeros(sobelx.size(), CV_32FC1); __m128i X, Y; __m128i XY; __m128 floats1, floats2; for (int i = 0; i < SobelXY.rows; i++) { float *_norm = SobelXY.ptr<float>(i); short *_dx = sobelx.ptr<short>(i); short *_dy = sobely.ptr<short>(i); int j; for (j = 0; j < SobelXY.cols -8; j = j + 8) { __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j)); __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j)); __m128i v_dx_ml = _mm_mullo_epi16(v_dx, v_dx), v_dx_mh = _mm_mulhi_epi16(v_dx, v_dx); __m128i v_dy_ml = _mm_mullo_epi16(v_dy, v_dy), v_dy_mh = _mm_mulhi_epi16(v_dy, v_dy); __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx_ml, v_dx_mh), _mm_unpacklo_epi16(v_dy_ml, v_dy_mh)); floats1 = _mm_cvtepi32_ps(v_norm); //整型-> floats,得到dataX floats1 = _mm_sqrt_ps(floats1); _mm_storeu_ps(_norm + j, floats1); v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx_ml, v_dx_mh), _mm_unpackhi_epi16(v_dy_ml, v_dy_mh)); floats2 = _mm_cvtepi32_ps(v_norm); //整型-> floats,得到dataX floats2 = _mm_sqrt_ps(floats2); _mm_storeu_ps(_norm + j + 4, floats2); } } }
AVX优化一下:
void simd_AVX_Sobel(Mat &sobelx, Mat &sobely, Mat &SobelXY) { SobelXY = Mat::zeros(sobelx.size(), CV_32FC1); __m128i x1,x2, y1, y2; __m256i X, Y; __m256i XY; __m256 floats,floats1, floats2; for (int i = 0; i < SobelXY.rows; i++) { float *data = SobelXY.ptr<float>(i); short *datax = sobelx.ptr<short>(i); short *datay = sobely.ptr<short>(i); int j; for (j = 0; j + 8< SobelXY.cols + 1; j = j + 8) { x1 = _mm_loadu_si128((__m128i*)(datax +j)); x1 = _mm_cvtepi16_epi32(x1); //16->32 整形。 x2 = _mm_loadu_si128((__m128i*)(datax + j+4)); x2 = _mm_cvtepi16_epi32(x2); X = _mm256_set_m128i(x2, x1); floats1 = _mm256_cvtepi32_ps(X); //整型-> floats,得到dataX y1 = _mm_loadu_si128((__m128i*)(datay + j)); y1 = _mm_cvtepi16_epi32(y1); //16->32 整形。 y2 = _mm_loadu_si128((__m128i*)(datay + j + 4)); y2 = _mm_cvtepi16_epi32(y2); Y = _mm256_set_m128i(y2, y1); floats2 = _mm256_cvtepi32_ps(Y); //整型-> floats,得到dataX floats1 = _mm256_mul_ps(floats1, floats1); floats2 = _mm256_mul_ps(floats2, floats2); floats = _mm256_add_ps(floats1, floats2); floats = _mm256_sqrt_ps(floats); _mm256_storeu_ps(data + j, floats); } } }
这里有个头疼的问题,short类型数据转化为32位整型数据,AVX没有相应指令,其他办法有没有呢?