jibinghu

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

AVX加速卷积part2

重新构筑下昨天的想法:
问题:

源程序在O2下的执行时间:

经过AVX改进后的执行时间:

下面尝试在AVX2基础上改进:
AVX与AVX2的主要区别和改进:

向量整数指令:
AVX主要集中在浮点数运算上,提供了对256位宽SIMD(单指令多数据)向量的支持。
AVX2引入了向量整数运算的支持。这意味着,除了浮点运算外,AVX2还可以对256位的整数向量执行各种运算,这在处理图形、视频处理和数字信号处理等领域特别有用。
FMA(Fused Multiply-Add)指令:
虽然FMA指令首次出现在AVX中(作为AVX的扩展,称为FMA3),但AVX2进一步加强了对这些指令的支持。FMA指令可以在单个操作中完成乘法和加法,从而提高了计算的效率和准确性。
Gather指令:
AVX2引入了gather指令,这允许从非连续内存位置加载数据到一个寄存器中。这对于处理稀疏矩阵和不规则数据结构特别有用,因为它可以减少必须手动编码的复杂内存访问模式。
向量化的位操作:
AVX2增加了对向量位操作的支持,使得位级并行处理成为可能,这在某些算法中可以极大提高性能。
扩展的整数指令集:
AVX2通过扩展的整数指令集,提供了更多处理整数向量的能力,如增加、减少、乘法等操作。

FMA func.

// conv_avx2.cpp

bool Convolve1D_Ks5_F64_AVX(double* __restrict__ y, const double* __restrict__ x, const double* __restrict__ kernel, int64_t num_pts) {
    constexpr int64_t kernel_size = 5;
    constexpr int64_t ks2 = kernel_size / 2;

    // Ensure the input size is sufficient for the convolution operation
    if (num_pts < kernel_size) {
        return false;
    }

    // Load the convolution kernel into AVX2 registers
    __m256d k0 = _mm256_set1_pd(kernel[0]);
    __m256d k1 = _mm256_set1_pd(kernel[1]);
    __m256d k2 = _mm256_set1_pd(kernel[2]);
    __m256d k3 = _mm256_set1_pd(kernel[3]);
    __m256d k4 = _mm256_set1_pd(kernel[4]);

    // Perform convolution operations in chunks using AVX2
    for (int64_t i = ks2; i <= num_pts - kernel_size; i += 4) {
        // Load chunks of the input signal into AVX2 registers
        // std::cout << i << '\n';
        __m256d x0 = _mm256_loadu_pd(&x[i + 2]);
        __m256d x1 = _mm256_loadu_pd(&x[i + 1]);
        __m256d x2 = _mm256_loadu_pd(&x[i]);
        __m256d x3 = _mm256_loadu_pd(&x[i - 1]);
        __m256d x4 = _mm256_loadu_pd(&x[i - 2]);

        // Perform element-wise multiplication and sum the results
        __m256d y_val = _mm256_setzero_pd();
        y_val = _mm256_fmadd_pd(x0,k0,y_val);
        y_val = _mm256_fmadd_pd(x1,k1,y_val);
        y_val = _mm256_fmadd_pd(x2,k2,y_val);
        y_val = _mm256_fmadd_pd(x3,k3,y_val);
        y_val = _mm256_fmadd_pd(x4,k4,y_val);

        // Store the convolution results back to the output array
        _mm256_storeu_pd(&y[i], y_val);
    }

    // Handle the remainder of the elements that don't fit into a multiple of 4
    for (int64_t i = num_pts - ks2; i < num_pts; i++) {
        double result = 0.0;
        for (int64_t k = -ks2; k <= ks2; k++) {
            result += x[i - k] * kernel[k + ks2];
        }
        y[i] = result;
    }

    return true;
}

在AVX512指令集下进行改进:

// conv_avx512.cpp

bool Convolve1D_Ks5_F64_AVX512(double* __restrict__ y, const double* __restrict__ x, const double* __restrict__ kernel, int64_t num_pts) {
    constexpr int64_t kernel_size = 5;
    constexpr int64_t ks2 = kernel_size / 2;

    // Ensure the input size is sufficient for the convolution operation
    if (num_pts < kernel_size) {
        return false;
    }

    // Load the convolution kernel into AVX512 registers
    __m512d k0 = _mm512_set1_pd(kernel[0]);
    __m512d k1 = _mm512_set1_pd(kernel[1]);
    __m512d k2 = _mm512_set1_pd(kernel[2]);
    __m512d k3 = _mm512_set1_pd(kernel[3]);
    __m512d k4 = _mm512_set1_pd(kernel[4]);

    // Perform convolution operations in chunks using AVX512
    for (int64_t i = ks2; i <= num_pts - kernel_size; i += 8) { // Note: Change stride to 8 for AVX512
        // Load chunks of the input signal into AVX512 registers
        __m512d x0 = _mm512_loadu_pd(&x[i + 2]);
        __m512d x1 = _mm512_loadu_pd(&x[i + 1]);
        __m512d x2 = _mm512_loadu_pd(&x[i]);
        __m512d x3 = _mm512_loadu_pd(&x[i - 1]);
        __m512d x4 = _mm512_loadu_pd(&x[i - 2]);

        // Perform element-wise multiplication and sum the results
        __m512d y_val = _mm512_setzero_pd();
        y_val = _mm512_fmadd_pd(x0, k0, y_val);
        y_val = _mm512_fmadd_pd(x1, k1, y_val);
        y_val = _mm512_fmadd_pd(x2, k2, y_val);
        y_val = _mm512_fmadd_pd(x3, k3, y_val);
        y_val = _mm512_fmadd_pd(x4, k4, y_val);

        // Store the convolution results back to the output array
        _mm512_storeu_pd(&y[i], y_val);
    }

    // Handle the remainder of the elements that don't fit into a multiple of 8
    for (int64_t i = num_pts - ks2; i < num_pts; i++) {
        double result = 0.0;
        for (int64_t k = -ks2; k <= ks2; k++) {
            result += x[i - k] * kernel[k + ks2];
        }
        y[i] = result;
    }

    return true;
}

执行结果:

Pthreads库学习


posted on 2024-04-09 11:13  zombie_black  阅读(13)  评论(0编辑  收藏  举报