Sobel算子的指令集加速例子

Sobel算子的卷积核如下。这里以x轴向右为正；y轴向下为正：

$${\mathbf{D_{x}}=\bigl(\begin{smallmatrix} -1 & 0 & 1 \\ -2 & 0 & 2 \\ -1 & 0 & 1 \end{smallmatrix}\bigr), \mathbf{D_{y}}=\bigl(\begin{smallmatrix} -1 & -2 & -1 \\ 0 & 0 & 0 \\ 1 & 2 & 1 \end{smallmatrix}\bigr),\mathbf{Z}=\sqrt{\left(Image\ast\mathbf{D_{x}}\right)^{2}+\left(Image\ast\mathbf{D_{y}}\right)^{2}}}$$

下面直接给出我做的指令集加速的代码，计算的是梯度强度图${\mathbf{Z}}$，数据类型是CV_16UC1，图片边缘是按照REFLECT101方式处理的。内部计算方法是边缘位置用普通C++代码处理，而中间的大片区域用SSE加速。代码基于VS2017、OpenCV430和Qt5.9。CPU型号是Intel Core i5-7400。经过对比在Release版程序中该方法比同功能的OpenCV代码快。当然不同的CPU对指令集的支持效率不同，而且在某些情况下OpenCV会使用多线程加速，所以有可能在别的电脑上OpenCV更快。这里的比较结果仅供参考。代码如下：

void lineF1L1(const Mat& image, Mat& output)
{
    const int lastCol = image.cols - 1;
    const int lastRow = image.rows - 1;
    /* 第一行 */
    const uchar* iptr1 = image.ptr<uchar>(0);
    const uchar* iptr2 = image.ptr<uchar>(1);
    ushort* optr = output.ptr<ushort>(0);
    optr[0] = 0;
    optr[lastCol] = 0;
    for (int i = 1; i < lastCol; i++)
    {
        optr[i] = abs((iptr1[i + 1] - iptr1[i - 1]) + (iptr2[i + 1] - iptr2[i - 1])) * 2;
    }
    /* 最后一行 */
    iptr1 = image.ptr<uchar>(lastRow - 1);
    iptr2 = image.ptr<uchar>(lastRow);
    optr = output.ptr<ushort>(lastRow);
    optr[0] = 0;
    optr[lastCol] = 0;
    for (int i = 1; i < lastCol; i++)
    {
        optr[i] = abs((iptr1[i + 1] - iptr1[i - 1]) + (iptr2[i + 1] - iptr2[i - 1])) * 2;
    }
}

void sobel(const Mat& image, Mat& output)
{
    output.create(image.size(), CV_16UC1);
    lineF1L1(image, output);
    const int lastRow = image.rows - 1;
    const int lastCol = image.cols - 1;
    const int max16 = std::max(0, image.cols / 16 * 16 - 8);
    __m128i shift08 = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1);
    __m128i shift16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1);
    __m128i picklo = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    __m128i pickhi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13);
    __m128i extr47 = _mm_setr_epi8(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
    for (int i = 1; i < lastRow; i++)
    {
        const uchar* iptr1 = image.ptr<uchar>(i - 1);
        const uchar* iptr2 = image.ptr<uchar>(i);
        const uchar* iptr3 = image.ptr<uchar>(i + 1);
        ushort* optr = output.ptr<ushort>(i);
        /* 第一列 */
        optr[0] = abs((iptr3[0] - iptr1[0]) + (iptr3[1] - iptr1[1])) * 2;
        /* 第二列到max16+1 */
        for (int j = 0; j < max16; j += 8)
        {
            __m128i pixels1 = _mm_loadu_si128((__m128i*)&iptr1[j]);
            __m128i pixels2 = _mm_loadu_si128((__m128i*)&iptr2[j]);
            __m128i pixels3 = _mm_loadu_si128((__m128i*)&iptr3[j]);

            __m128i i16rc11 = _mm_cvtepu8_epi16(pixels1); /* 第1行第1列像素 */
            __m128i r1shift08 = _mm_shuffle_epi8(pixels1, shift08);
            __m128i i16rc12 = _mm_cvtepu8_epi16(r1shift08); /* 第1行第2列像素 */
            __m128i r1shift16 = _mm_shuffle_epi8(pixels1, shift16);
            __m128i i16rc13 = _mm_cvtepu8_epi16(r1shift16); /* 第1行第3列像素 */

            __m128i i16rc21 = _mm_cvtepu8_epi16(pixels2); /* 第2行第1列像素 */
            __m128i r2shift16 = _mm_shuffle_epi8(pixels2, shift16);
            __m128i i16rc23 = _mm_cvtepu8_epi16(r2shift16); /* 第2行第3列像素 */

            __m128i i16rc31 = _mm_cvtepu8_epi16(pixels3); /* 第3行第1列像素 */
            __m128i r3shift08 = _mm_shuffle_epi8(pixels3, shift08);
            __m128i i16rc32 = _mm_cvtepu8_epi16(r3shift08); /* 第3行第2列像素 */
            __m128i r3shift16 = _mm_shuffle_epi8(pixels3, shift16);
            __m128i i16rc33 = _mm_cvtepu8_epi16(r3shift16); /* 第3行第3列像素 */

            __m128i haddy1 = _mm_add_epi16(_mm_add_epi16(i16rc11, i16rc13), _mm_add_epi16(i16rc12, i16rc12));
            __m128i haddy3 = _mm_add_epi16(_mm_add_epi16(i16rc31, i16rc33), _mm_add_epi16(i16rc32, i16rc32));
            __m128i haddy = _mm_sub_epi16(haddy3, haddy1);
            __m128i haddx1 = _mm_add_epi16(_mm_add_epi16(i16rc11, i16rc31), _mm_add_epi16(i16rc21, i16rc21));
            __m128i haddx3 = _mm_add_epi16(_mm_add_epi16(i16rc13, i16rc33), _mm_add_epi16(i16rc23, i16rc23));
            __m128i haddx = _mm_sub_epi16(haddx3, haddx1);

            __m128 dy1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(haddy));
            __m128 dx1 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(haddx));
            __m128 apex1 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx1, dx1), _mm_mul_ps(dy1, dy1)));
            __m128i dz1 = _mm_cvtps_epi32(apex1);
            __m128i le = _mm_shuffle_epi8(dz1, picklo); /* 此处是i32到u16的强制转换，只取低地址2字节 */

            __m128 dy2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_shuffle_epi8(haddy, extr47)));
            __m128 dx2 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_shuffle_epi8(haddx, extr47)));
            __m128 apex2 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx2, dx2), _mm_mul_ps(dy2, dy2)));
            __m128i dz2 = _mm_cvtps_epi32(apex2);
            __m128i ri = _mm_shuffle_epi8(dz2, pickhi); /* 此处是i32到u16的强制转换，只取低地址2字节 */
            __m128i result = _mm_blend_epi16(le, ri, 0b11110000);
            _mm_storeu_si128((__m128i*)&optr[j + 1], result);
        }
        /* max16+1到倒数第二列 */
        for (int j = max16 + 1; j < lastCol; j++)
        {
            int dx = (iptr1[j + 1] - iptr1[j - 1]) + 2 * (iptr2[j + 1] - iptr2[j - 1]) + (iptr3[j + 1] - iptr3[j - 1]);
            int dy = (iptr3[j - 1] - iptr1[j - 1]) + 2 * (iptr3[j] - iptr1[j]) + (iptr3[j + 1] - iptr1[j + 1]);
            float dz = sqrtf(dx * dx + dy * dy);
            optr[j] = ushort(dz);
        }
        /* 最后一列 */
        optr[lastCol] = abs((iptr3[lastCol - 1] - iptr1[lastCol - 1]) + (iptr3[lastCol] - iptr1[lastCol])) * 2;
    }
}

void main()
{
    Mat image = imread(R"(2.bmp)", IMREAD_GRAYSCALE);
    resize(image, image, Size(960, 640));
    int64 t1, t2;

    Mat result;
    t1 = getTickCount();
    sobel(image, result);
    t2 = getTickCount();
    qDebug() << u8"diy(ms):" << (t2 - t1) / getTickFrequency() * 1000;

    Mat dx, dy, grad;
    t1 = getTickCount();
    spatialGradient(image, dx, dy);
    dx.convertTo(dx, CV_32FC1);
    dy.convertTo(dy, CV_32FC1);
    sqrt(dx.mul(dx) + dy.mul(dy), grad);
    t2 = getTickCount();
    qDebug() << u8"cv(ms)" << (t2 - t1) / getTickFrequency() * 1000;

    double maxv1, maxv2;
    minMaxIdx(result, 0, &maxv1);
    minMaxIdx(grad, 0, &maxv2);
    qDebug() << "diy max:" << maxv1 << "cv max:" << maxv2;

    threshold(result, result, 100, 65535, THRESH_BINARY);
    threshold(grad, grad, 100, 65535, THRESH_BINARY);

    imshow("sobel diy", result);
    imshow("sobel cv", grad);
    int x = 0;
}

下面是文本输出：

diy(ms): 0.7707
cv(ms) 9.0627
diy max: 600 cv max: 600

这里是输出结果图片，可以看出我的结果跟OpenCV相同：

posted @ 2023-04-26 10:20 兜尼完阅读(225) 评论(0) 编辑收藏举报

刷新页面返回顶部

孟祥度的博客

这是人生的转折点，通向一个新世界

Sobel算子的指令集加速例子