基于边缘模板匹配算法加速

印度小哥写的基于边缘的模板匹配算法：基于边缘的模板匹配 - 徐唱 - 博客园 (cnblogs.com)，估计无需多数。

其中核心匹配算法为：

// stoping criterias to search for model
    double normMinScore = minScore / noOfCordinates; // precompute minumum score
    double normGreediness = ((1- greediness * minScore)/(1-greediness)) /noOfCordinates; // precompute greedniness

    for( i = 0; i < Ssize.height; i++ )
    {
        vector<double> gradMag;
        gradMag.clear();
        for( j = 0; j < Ssize.width; j++ )
        {
                iSx = (double)Sdx.at<short>(i,j);  // X derivative of Source image
                iSy = (double)Sdy.at<short>(i,j);  // Y derivative of Source image

                gradMag1=sqrt((iSx*iSx)+(iSy*iSy)); //Magnitude = Sqrt(dx^2 +dy^2)

                if(gradMag1!=0) // hande divide by zero
                    gradMag.push_back(1/gradMag1);   // 1/Sqrt(dx^2 +dy^2)
                else
                    gradMag.push_back(0);
        }
        matGradMag.push_back(gradMag);
    }
    for( i = 0; i < Ssize.height; i++ )
    {
            for( j = 0; j < Ssize.width; j++ )
             {
                 partialSum = 0; // initilize partialSum measure
                 for(m=0;m<noOfCordinates;m++)
                 {
                     curX    = i + cordinates[m].x ;    // template X coordinate
                     curY    = j + cordinates[m].y ; // template Y coordinate
                     iTx    = edgeDerivativeX[m];    // template X derivative
                     iTy    = edgeDerivativeY[m];   // template Y derivative

                     if(curX<0 ||curY<0||curX>Ssize.height-1 ||curY>Ssize.width-1)
                         continue;

                     short* _Sdx = Sdx.ptr<short>(curX);
                     short* _Sdy = Sdy.ptr<short>(curX);

                     iSx=_Sdx[curY]; // get curresponding  X derivative from source image
                     iSy=_Sdy[curY]; // get curresponding  Y derivative from source image

                     if((iSx!=0 || iSy!=0) && (iTx!=0 || iTy!=0))
                     {
                         //partial Sum  = Sum of(((Source X derivative* Template X drivative) + Source Y derivative * Template Y derivative)) / Edge magnitude of(Template)* edge magnitude of(Source))
                         partialSum = partialSum + ((iSx*iTx)+(iSy*iTy))*(edgeMagnitude[m] * matGradMag[curX][curY]);
                     }

                    sumOfCoords = m + 1;
                    partialScore = partialSum /sumOfCoords ;
                    // check termination criteria
                    // if partial score score is less than the score than needed to make the required score at that position
                    // break serching at that coordinate.
                    if( partialScore < (MIN((minScore -1) + normGreediness*sumOfCoords,normMinScore*  sumOfCoords)))
                        break;
                }
                if(partialScore > resultScore)
                {
                    resultScore = partialScore; //  Match score
                    resultPoint->x = i;            // result coordinate X
                    resultPoint->y = j;            // result coordinate Y
                }
            }
        }

看着很复杂，可以优化一下：

float resultScore = 0;
int length = gradData.size();
float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length;
float NormMinScore = minScore / length;

int border_left = templae_width_ / 2;
int border_top = templae_heigth_ / 2;

cv::Mat sobel_x, sobel_y, sobel_xy;
cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3);
cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3);

AVXSobelXY(sobel_x, sobel_y, sobel_xy);

cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0));

short* pSobleX = nullptr;
short* pSobleY = nullptr;
float* pSobleXY = nullptr;
if (sobel_x.isContinuous() && sobel_y.isContinuous()) {
pSobleX = (short*)sobel_x.ptr();
pSobleY = (short*)sobel_y.ptr();
pSobleXY = (float*)sobel_xy.ptr();
concurrency::parallel_for(0, static_cast<int>(src.rows ), [&](int y) {

float* sp_result = result.ptr<float>(y);
        for (uint x = 0; x < src.cols; x++) {
            for (uint index = 0; index < length; index += 1) {
                int curX = x + modelContourX[index];
                int curY = y + modelContourY[index];
                sum++;
                uint l = curY * sobel_x.cols + curX;
                float gx = pSobleX[l];
                float gy = pSobleY[l];
                float grad = pSobleXY[l];

                if (abs(gx) > 1e-7 || abs(gy) > 1e-7) 
                {    
                    partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad;
                }
                score = partialScore / sum;
                if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum)))
                    break;
            }
            *(sp_result + x) = score;
         }
      });

效果还不错，加速了很多，不过还不够呀。

继续优化：

    float resultScore = 0;
    int length = gradData.size();
    float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length;
    float NormMinScore = minScore / length;

    int border_left = templae_width_ / 2;
    int border_top = templae_heigth_ / 2;
    cv::Mat sobel_x, sobel_y, sobel_xy;
    cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3);
    cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3);

    cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0));

    short* pSobleX = nullptr;
    short* pSobleY = nullptr;
    float* pSobleXY = nullptr;
    if (sobel_x.isContinuous() && sobel_y.isContinuous()) {
        pSobleX = (short*)sobel_x.ptr();
        pSobleY = (short*)sobel_y.ptr();
        pSobleXY = (float*)sobel_xy.ptr();
    }

     
    int count = length / SSEStep;
    int count2 = length & (SSEStep - 1);
    concurrency::parallel_for(border_top, static_cast<int>(src.rows - border_top), [&](int y) {
        float* sp_result = result.ptr<float>(y);
        for (uint x = border_left; x < src.cols - border_left; x++) {
            float partialScore = 0;
            float score = 0;
            uint sum = 0; 
            __m256 _x = _mm256_set1_ps(x);
            __m256 _y = _mm256_set1_ps(y);
            __m256i _curX = _mm256_setzero_si256();
            __m256i _curY = _mm256_setzero_si256();
            __m256 _gx = _mm256_set1_ps(0.0f);
            __m256 _gy = _mm256_set1_ps(0.0f);
            __m256 _gxy = _mm256_set1_ps(0.0f);

            __m256 com_zero = _mm256_set1_ps(1e-5);
            __m256 flt_nan = _mm256_set1_ps(FLT_MAX);
            __m256i local = _mm256_setzero_si256();

            for (uint index = 0; index < length - count2; index += SSEStep) {
                _curX = _mm256_cvttps_epi32(_mm256_add_ps(_x, _mm256_load_ps(modelContourX + index)));
                _curY = _mm256_cvttps_epi32(_mm256_add_ps(_y, _mm256_load_ps(modelContourY + index)));

                 local = _mm256_add_epi32(_curX, _mm256_mullo_epi32(_curY, _mm256_set1_epi32(sobel_x.cols)));
                
                _gx.m256_f32[0] = pSobleX[local.m256i_i32[0]];
                _gy.m256_f32[0] = pSobleY[local.m256i_i32[0]];
                
                _gx.m256_f32[1] = pSobleX[local.m256i_i32[1]];
                _gy.m256_f32[1] = pSobleY[local.m256i_i32[1]];

                _gx.m256_f32[2] = pSobleX[local.m256i_i32[2]];
                _gy.m256_f32[2] = pSobleY[local.m256i_i32[2]];

                _gx.m256_f32[3] = pSobleX[local.m256i_i32[3]];
                _gy.m256_f32[3] = pSobleY[local.m256i_i32[3]];

                _gx.m256_f32[4] = pSobleX[local.m256i_i32[4]];
                _gy.m256_f32[4] = pSobleY[local.m256i_i32[4]];

                _gx.m256_f32[5] = pSobleX[local.m256i_i32[5]];
                _gy.m256_f32[5] = pSobleY[local.m256i_i32[5]];

                _gx.m256_f32[6] = pSobleX[local.m256i_i32[6]];
                _gy.m256_f32[6] = pSobleY[local.m256i_i32[6]];

                _gx.m256_f32[7] = pSobleX[local.m256i_i32[7]];
                _gy.m256_f32[7] = pSobleY[local.m256i_i32[7]];

                __m256 _graddot = _mm256_add_ps(_mm256_mul_ps(_gx, _mm256_load_ps(modelGradX + index)), _mm256_mul_ps(_gy, _mm256_load_ps(modelGradY + index)));
                __m256 _addgrad = _mm256_add_ps(_mm256_mul_ps(_gx, _gx), _mm256_mul_ps(_gy, _gy));
                __m256 cmp = _mm256_cmp_ps(_addgrad, com_zero, _CMP_LT_OQ);
                __m256 blend = _mm256_blendv_ps(_addgrad, flt_nan, cmp);
                __m256 _grad = _mm256_rsqrt_ps(blend);
                __m256 _value = _mm256_mul_ps(_graddot, _grad);

                for (uchar k = 0; k < SSEStep; k++) {
                    sum++;
                    partialScore += _value.m256_f32[k];
                    score = partialScore / sum;
                    if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum)))
                        goto Next;
                }
            }
            for (uint index = length - count2; index < length; index++) {
                sum++;
                uint curX = x + modelContourX[index];
                uint curY = y + modelContourY[index];
                uint l = curY * sobel_x.cols + curX;

                float gx = pSobleX[l];
                float gy = pSobleY[l];
                float grad = sqrt(gx * gx + gy * gy);

                if (grad > 1e-5) {    
                    partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad;
                }
                score = partialScore / sum;
                if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum)))
                    break;
            }
        Next:
            *(sp_result + x) = score;
         }
});

速度，很快。这张图若是不降采样需要30ms，降采样一倍的化，只需要10+ms。

应该还可以优化吧。

    float resultScore = 0;
    int length = gradData.size();
    float NormGreediness = (1 - greediness * minScore) / (1 - greediness) / length;
    float NormMinScore = minScore / length;

    int border_left = templae_width_ / 2;
    int border_top = templae_heigth_ / 2;

    cv::Mat sobel_x, sobel_y, sobel_xy;
    cv::Sobel(src, sobel_x, CV_16SC1, 1, 0, 3);
    cv::Sobel(src, sobel_y, CV_16SC1, 0, 1, 3);
    cv::Mat result = cv::Mat(src.size(), CV_32FC1, cv::Scalar(0.0));

    short* pSobleX = nullptr;
    short* pSobleY = nullptr;
    float* pSobleXY = nullptr;
    if (sobel_x.isContinuous() && sobel_y.isContinuous()) {
        pSobleX = (short*)sobel_x.ptr();
        pSobleY = (short*)sobel_y.ptr();
        pSobleXY = (float*)sobel_xy.ptr();
    }


    int count = length / SSEStep;
    int count2 = length & (SSEStep - 1);
    concurrency::parallel_for(border_top, static_cast<int>(src.rows - border_top), [&](int y) {
        float* sp_result = result.ptr<float>(y);
        for (uint x = border_left; x < src.cols - border_left; x++) {
            float partialScore = 0;
            float score = 0;
            uint sum = 0; 
            __m512 _x = _mm512_set1_ps(x);
            __m512 _y = _mm512_set1_ps(y);
            __m512i  _curX = _mm512_setzero_si512();
            __m512i  _curY = _mm512_setzero_si512();
            __m512 _gx = _mm512_set1_ps(0.0f);
            __m512 _gy = _mm512_set1_ps(0.0f);
            __m512 _gxy = _mm512_set1_ps(0.0f);

            __m512 com_zero = _mm512_set1_ps(1e-5);
            __m512 flt_nan = _mm512_set1_ps(FLT_MAX);
            __m512i local = _mm512_setzero_si512();
            __m512i _cols = _mm512_set1_epi32(sobel_x.cols);

            for (uint index = 0; index < length - count2; index += SSEStep) {
                _curX = _mm512_cvttps_epi32(_mm512_add_ps(_x, _mm512_load_ps(modelContourX + index)));
                _curY = _mm512_cvttps_epi32(_mm512_add_ps(_y, _mm512_load_ps(modelContourY + index)));

                local = _mm512_add_epi32(_curX, _mm512_mullo_epi32(_curY, _cols));
                for (uchar k = 0; k < SSEStep; k++) {
                    _gx.m512_f32[k] = pSobleX[local.m512i_i32[k]];
                    _gy.m512_f32[k] = pSobleY[local.m512i_i32[k]];
                }

                __m512 _graddot = _mm512_add_ps(_mm512_mul_ps(_gx, _mm512_load_ps(modelGradX + index)), _mm512_mul_ps(_gy, _mm512_load_ps(modelGradY + index)));
                __m512 _addgrad = _mm512_add_ps(_mm512_mul_ps(_gx, _gx), _mm512_mul_ps(_gy, _gy));
                __mmask16 cmp = _mm512_cmp_ps_mask(_addgrad, com_zero, _CMP_LT_OQ);
                __m512 blend = _mm512_mask_blend_ps(cmp,_addgrad, flt_nan);
                __m512 _grad = _mm512_sqrt_ps(blend);
                __m512 _value = _mm512_div_ps(_graddot, _grad);
                //__m512 _mm512_rsqrt28_ps(__m512 a), 报错

                for (uchar k = 0; k < SSEStep; k++) {
                    sum++;
                    partialScore += _value.m512_f32[k];
                    score = partialScore / sum;
                    if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum)))
                        goto Next;
                }
            }
            for (uint index = length - count2; index < length; index++) {
                sum++;
                uint curX = x + modelContourX[index];
                uint curY = y + modelContourY[index];
                uint l = curY * sobel_x.cols + curX;

                float gx = pSobleX[l];
                float gy = pSobleY[l];
                float grad = sqrt(gx * gx + gy * gy);

                if (grad > 1e-5) {
                    partialScore += ((gx * modelGradX[index] + gy * modelGradY[index])) / grad;
                }
                score = partialScore / sum;
                if (score < (std::min((minScore - 1) + NormGreediness * sum, NormMinScore * sum)))
                    break;
            }
        Next:
            * (sp_result + x) = score;
        }
        });

posted @ 2023-06-06 15:15 盖世猪猪侠阅读(304) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

盖世猪猪侠

基于边缘模板匹配算法加速

公告