基于边缘模板匹配算法加速
印度小哥写的基于边缘的模板匹配算法:基于边缘的模板匹配 - 徐唱 - 博客园 (cnblogs.com),估计无需多数。
其中核心匹配算法为:
// stoping criterias to search for model double normMinScore = minScore / noOfCordinates; // precompute minumum score double normGreediness = ((1- greediness * minScore)/(1-greediness)) /noOfCordinates; // precompute greedniness for( i = 0; i < Ssize.height; i++ ) { vector<double> gradMag; gradMag.clear(); for( j = 0; j < Ssize.width; j++ ) { iSx = (double)Sdx.at<short>(i,j); // X derivative of Source image iSy = (double)Sdy.at<short>(i,j); // Y derivative of Source image gradMag1=sqrt((iSx*iSx)+(iSy*iSy)); //Magnitude = Sqrt(dx^2 +dy^2) if(gradMag1!=0) // hande divide by zero gradMag.push_back(1/gradMag1); // 1/Sqrt(dx^2 +dy^2) else gradMag.push_back(0); } matGradMag.push_back(gradMag); } for( i = 0; i < Ssize.height; i++ ) { for( j = 0; j < Ssize.width; j++ ) { partialSum = 0; // initilize partialSum measure for(m=0;m<noOfCordinates;m++) { curX = i + cordinates[m].x ; // template X coordinate curY = j + cordinates[m].y ; // template Y coordinate iTx = edgeDerivativeX[m]; // template X derivative iTy = edgeDerivativeY[m]; // template Y derivative if(curX<0 ||curY<0||curX>Ssize.height-1 ||curY>Ssize.width-1) continue; short* _Sdx = Sdx.ptr<short>(curX); short* _Sdy = Sdy.ptr<short>(curX); iSx=_Sdx[curY]; // get curresponding X derivative from source image iSy=_Sdy[curY]; // get curresponding Y derivative from source image if((iSx!=0 || iSy!=0) && (iTx!=0 || iTy!=0)) { //partial Sum = Sum of(((Source X derivative* Template X drivative) + Source Y derivative * Template Y derivative)) / Edge magnitude of(Template)* edge magnitude of(Source)) partialSum = partialSum + ((iSx*iTx)+(iSy*iTy))*(edgeMagnitude[m] * matGradMag[curX][curY]); } sumOfCoords = m + 1; partialScore = partialSum /sumOfCoords ; // check termination criteria // if partial score score is less than the score than needed to make the required score at that position // break serching at that coordinate. if( partialScore < (MIN((minScore -1) + normGreediness*sumOfCoords,normMinScore* sumOfCoords))) break; } if(partialScore > resultScore) { resultScore = partialScore; // Match score resultPoint->x = i; // result coordinate X resultPoint->y = j; // result coordinate Y } } }
看着很复杂,可以优化一下:
float resultScore = 0;
int length = gradData.size();
float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length;
float NormMinScore = minScore / length;
int border_left = templae_width_ / 2;
int border_top = templae_heigth_ / 2;
cv::Mat sobel_x, sobel_y, sobel_xy;
cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3);
cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3);
AVXSobelXY(sobel_x, sobel_y, sobel_xy);
cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0));
short* pSobleX = nullptr;
short* pSobleY = nullptr;
float* pSobleXY = nullptr;
if (sobel_x.isContinuous() && sobel_y.isContinuous()) {
pSobleX = (short*)sobel_x.ptr();
pSobleY = (short*)sobel_y.ptr();
pSobleXY = (float*)sobel_xy.ptr();
concurrency::parallel_for(0, static_cast<int>(src.rows ), [&](int y) {
float* sp_result = result.ptr<float>(y); for (uint x = 0; x < src.cols; x++) { for (uint index = 0; index < length; index += 1) { int curX = x + modelContourX[index]; int curY = y + modelContourY[index]; sum++; uint l = curY * sobel_x.cols + curX; float gx = pSobleX[l]; float gy = pSobleY[l]; float grad = pSobleXY[l]; if (abs(gx) > 1e-7 || abs(gy) > 1e-7) { partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad; } score = partialScore / sum; if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum))) break; } *(sp_result + x) = score; } });
效果还不错,加速了很多,不过还不够呀。
继续优化:
float resultScore = 0; int length = gradData.size(); float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length; float NormMinScore = minScore / length; int border_left = templae_width_ / 2; int border_top = templae_heigth_ / 2; cv::Mat sobel_x, sobel_y, sobel_xy; cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3); cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3); cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0)); short* pSobleX = nullptr; short* pSobleY = nullptr; float* pSobleXY = nullptr; if (sobel_x.isContinuous() && sobel_y.isContinuous()) { pSobleX = (short*)sobel_x.ptr(); pSobleY = (short*)sobel_y.ptr(); pSobleXY = (float*)sobel_xy.ptr(); } int count = length / SSEStep; int count2 = length & (SSEStep - 1); concurrency::parallel_for(border_top, static_cast<int>(src.rows - border_top), [&](int y) { float* sp_result = result.ptr<float>(y); for (uint x = border_left; x < src.cols - border_left; x++) { float partialScore = 0; float score = 0; uint sum = 0; __m256 _x = _mm256_set1_ps(x); __m256 _y = _mm256_set1_ps(y); __m256i _curX = _mm256_setzero_si256(); __m256i _curY = _mm256_setzero_si256(); __m256 _gx = _mm256_set1_ps(0.0f); __m256 _gy = _mm256_set1_ps(0.0f); __m256 _gxy = _mm256_set1_ps(0.0f); __m256 com_zero = _mm256_set1_ps(1e-5); __m256 flt_nan = _mm256_set1_ps(FLT_MAX); __m256i local = _mm256_setzero_si256(); for (uint index = 0; index < length - count2; index += SSEStep) { _curX = _mm256_cvttps_epi32(_mm256_add_ps(_x, _mm256_load_ps(modelContourX + index))); _curY = _mm256_cvttps_epi32(_mm256_add_ps(_y, _mm256_load_ps(modelContourY + index))); local = _mm256_add_epi32(_curX, _mm256_mullo_epi32(_curY, _mm256_set1_epi32(sobel_x.cols))); _gx.m256_f32[0] = pSobleX[local.m256i_i32[0]]; _gy.m256_f32[0] = pSobleY[local.m256i_i32[0]]; _gx.m256_f32[1] = pSobleX[local.m256i_i32[1]]; _gy.m256_f32[1] = pSobleY[local.m256i_i32[1]]; _gx.m256_f32[2] = pSobleX[local.m256i_i32[2]]; _gy.m256_f32[2] = pSobleY[local.m256i_i32[2]]; _gx.m256_f32[3] = pSobleX[local.m256i_i32[3]]; _gy.m256_f32[3] = pSobleY[local.m256i_i32[3]]; _gx.m256_f32[4] = pSobleX[local.m256i_i32[4]]; _gy.m256_f32[4] = pSobleY[local.m256i_i32[4]]; _gx.m256_f32[5] = pSobleX[local.m256i_i32[5]]; _gy.m256_f32[5] = pSobleY[local.m256i_i32[5]]; _gx.m256_f32[6] = pSobleX[local.m256i_i32[6]]; _gy.m256_f32[6] = pSobleY[local.m256i_i32[6]]; _gx.m256_f32[7] = pSobleX[local.m256i_i32[7]]; _gy.m256_f32[7] = pSobleY[local.m256i_i32[7]]; __m256 _graddot = _mm256_add_ps(_mm256_mul_ps(_gx, _mm256_load_ps(modelGradX + index)), _mm256_mul_ps(_gy, _mm256_load_ps(modelGradY + index))); __m256 _addgrad = _mm256_add_ps(_mm256_mul_ps(_gx, _gx), _mm256_mul_ps(_gy, _gy)); __m256 cmp = _mm256_cmp_ps(_addgrad, com_zero, _CMP_LT_OQ); __m256 blend = _mm256_blendv_ps(_addgrad, flt_nan, cmp); __m256 _grad = _mm256_rsqrt_ps(blend); __m256 _value = _mm256_mul_ps(_graddot, _grad); for (uchar k = 0; k < SSEStep; k++) { sum++; partialScore += _value.m256_f32[k]; score = partialScore / sum; if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum))) goto Next; } } for (uint index = length - count2; index < length; index++) { sum++; uint curX = x + modelContourX[index]; uint curY = y + modelContourY[index]; uint l = curY * sobel_x.cols + curX; float gx = pSobleX[l]; float gy = pSobleY[l]; float grad = sqrt(gx * gx + gy * gy); if (grad > 1e-5) { partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad; } score = partialScore / sum; if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum))) break; } Next: *(sp_result + x) = score; } });
速度,很快。这张图若是不降采样需要30ms,降采样一倍的化,只需要10+ms。
应该还可以优化吧。
float resultScore = 0; int length = gradData.size(); float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length; float NormMinScore = minScore / length; int border_left = templae_width_ / 2; int border_top = templae_heigth_ / 2; cv::Mat sobel_x, sobel_y, sobel_xy; cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3); cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3); cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0)); short* pSobleX = nullptr; short* pSobleY = nullptr; float* pSobleXY = nullptr; if (sobel_x.isContinuous() && sobel_y.isContinuous()) { pSobleX = (short*)sobel_x.ptr(); pSobleY = (short*)sobel_y.ptr(); pSobleXY = (float*)sobel_xy.ptr(); } int count = length / SSEStep; int count2 = length & (SSEStep - 1); concurrency::parallel_for(border_top, static_cast<int>(src.rows - border_top), [&](int y) { float* sp_result = result.ptr<float>(y); for (uint x = border_left; x < src.cols - border_left; x++) { float partialScore = 0; float score = 0; uint sum = 0; __m512 _x = _mm512_set1_ps(x); __m512 _y = _mm512_set1_ps(y); __m512i _curX = _mm512_setzero_si512(); __m512i _curY = _mm512_setzero_si512(); __m512 _gx = _mm512_set1_ps(0.0f); __m512 _gy = _mm512_set1_ps(0.0f); __m512 _gxy = _mm512_set1_ps(0.0f); __m512 com_zero = _mm512_set1_ps(1e-5); __m512 flt_nan = _mm512_set1_ps(FLT_MAX); __m512i local = _mm512_setzero_si512(); __m512i _cols = _mm512_set1_epi32(sobel_x.cols); for (uint index = 0; index < length - count2; index += SSEStep) { _curX = _mm512_cvttps_epi32(_mm512_add_ps(_x, _mm512_load_ps(modelContourX + index))); _curY = _mm512_cvttps_epi32(_mm512_add_ps(_y, _mm512_load_ps(modelContourY + index))); local = _mm512_add_epi32(_curX, _mm512_mullo_epi32(_curY, _cols)); for (uchar k = 0; k < SSEStep; k++) { _gx.m512_f32[k] = pSobleX[local.m512i_i32[k]]; _gy.m512_f32[k] = pSobleY[local.m512i_i32[k]]; } __m512 _graddot = _mm512_add_ps(_mm512_mul_ps(_gx, _mm512_load_ps(modelGradX + index)), _mm512_mul_ps(_gy, _mm512_load_ps(modelGradY + index))); __m512 _addgrad = _mm512_add_ps(_mm512_mul_ps(_gx, _gx), _mm512_mul_ps(_gy, _gy)); __mmask16 cmp = _mm512_cmp_ps_mask(_addgrad, com_zero, _CMP_LT_OQ); __m512 blend = _mm512_mask_blend_ps(cmp,_addgrad, flt_nan); __m512 _grad = _mm512_sqrt_ps(blend); __m512 _value = _mm512_div_ps(_graddot, _grad); //__m512 _mm512_rsqrt28_ps(__m512 a), 报错 for (uchar k = 0; k < SSEStep; k++) { sum++; partialScore += _value.m512_f32[k]; score = partialScore / sum; if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum))) goto Next; } } for (uint index = length - count2; index < length; index++) { sum++; uint curX = x + modelContourX[index]; uint curY = y + modelContourY[index]; uint l = curY * sobel_x.cols + curX; float gx = pSobleX[l]; float gy = pSobleY[l]; float grad = sqrt(gx * gx + gy * gy); if (grad > 1e-5) { partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad; } score = partialScore / sum; if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum))) break; } Next: * (sp_result + x) = score; } });