C# 使用SIMD系列方法加速批量运算

 

我们现在想做一些简单的批量运算,比如累乘得积,累加求和

/// <summary>
/// Scalar (non-SIMD) reference implementations of the bulk reductions,
/// used as the baseline the vectorized versions are compared against.
/// </summary>
public class NormalCalc
{
    /// <summary>Multiplies every element of <paramref name="nums"/> together.</summary>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>The product of all elements; 1.0 for an empty array.</returns>
    public static double Multiply(double[] nums)
    {
        double product = 1.0d;
        foreach (double value in nums)
        {
            product *= value;
        }

        return product;
    }

    /// <summary>Adds every element of <paramref name="nums"/> together.</summary>
    /// <param name="nums">Values to add.</param>
    /// <returns>The sum of all elements; 0.0 for an empty array.</returns>
    public static double AddTotal(double[] nums)
    {
        double sum = 0.0d;
        foreach (double value in nums)
        {
            sum += value;
        }

        return sum;
    }
}

 

这种批量运算不正是指令集的优势么,那就试试吧

C#中可以使用Vector类来做宽位运算,我这里有avx2指令集,也就是256位,double是64位的,那就有4个,如果做int运算自然就有8个

在这里就是每次取4个数放到一个Vector里一起做乘法运算,最后把累乘Vector中的4个结果拷贝到数组中相乘,再把末尾不足4个的剩余元素逐个乘完就好了。乘法嘛,用1作为初始值(种子)

 

    /// <summary>
    /// Multiplies all elements of a double array, processing
    /// <c>Vector&lt;double&gt;.Count</c> elements per iteration through a pinned pointer.
    /// </summary>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>The product of every element; 1.0 when the array is empty.</returns>
    public unsafe static double Multiply(double[] nums)
    {
        int lanes = Vector<double>.Count;
        Vector<double> laneProducts = Vector<double>.One;
        int index = 0;

        fixed (double* basePtr = nums)
        {
            // Vectorized part: each lane accumulates the product of every lanes-th element.
            while (index <= nums.Length - lanes)
            {
                laneProducts *= Unsafe.Read<Vector<double>>(basePtr + index);
                index += lanes;
            }
        }

        // Horizontal step: fold the per-lane partial products into a single scalar.
        double product = 1.0d;
        for (int lane = 0; lane < lanes; lane++)
        {
            product *= laneProducts[lane];
        }

        // Tail: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            product *= nums[index];
            index++;
        }

        return product;
    }

 

下一个问题就是,我总不见得又得每种数据类型都写一遍吧,咱有没有办法用C#的各种新特性写成泛型?咱有Span有预览特性INumber,试了下还真可以

用new Vector构造,泛型T用INumber约束就有了T.One来表示数字1,并且能随便的做乘法运算了

    /// <summary>
    /// Generic vectorized product of all elements of an array.
    /// </summary>
    /// <typeparam name="T">A numeric value type usable as a <see cref="Vector{T}"/> element.</typeparam>
    /// <param name="nums">Values to multiply.</param>
    /// <returns>The product of every element; <c>T.One</c> when the array is empty.</returns>
    public static T Multiply<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        Vector<T> laneProducts = Vector<T>.One;
        int index = 0;

        // Vectorized part: multiply one full vector of elements per iteration.
        while (index <= nums.Length - lanes)
        {
            laneProducts *= new Vector<T>(nums, index);
            index += lanes;
        }

        // Horizontal step: fold the per-lane partial products into one scalar.
        T product = T.One;
        for (int lane = 0; lane < lanes; lane++)
        {
            product *= laneProducts[lane];
        }

        // Tail: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            product *= nums[index];
            index++;
        }

        return product;
    }

 

理论上Span速度不会比指针快,new Vector不会比Unsafe.Read快,但是差不了太多,就能写成泛型方法

来测试一下速度:

 


// Build the input array: random doubles, each scaled by 2.723.
const int elementCount = 100000;
const int rounds = 10000;

var rng = new Random();
var values = new double[elementCount];
int fill = 0;
while (fill < values.Length)
{
    values[fill] = rng.NextDouble() * 2.723;
    fill++;
}

// Scalar product loop (baseline).
var timer = new Stopwatch();
timer.Start();
for (int round = 0; round < rounds; round++)
{
    NormalCalc.Multiply(values);
}
timer.Stop();
Console.WriteLine(timer.ElapsedMilliseconds);

// Vector-based product.
timer.Restart();
for (int round = 0; round < rounds; round++)
{
    SIMD_Calc.Multiply(values);
}
timer.Stop();
Console.WriteLine(timer.ElapsedMilliseconds);

// Generic version written with Vector + Span + INumber.
timer.Restart();
for (int round = 0; round < rounds; round++)
{
    SIMD_Calc.MultiplySpan(values);
}
timer.Stop();
Console.WriteLine(timer.ElapsedMilliseconds);

 

结果为:

730

185

190

 

不错不错,效果还挺满意的

 

接下来来个泛型的累加

    /// <summary>
    /// Generic vectorized sum of all elements of an array.
    /// </summary>
    /// <typeparam name="T">A numeric value type usable as a <see cref="Vector{T}"/> element.</typeparam>
    /// <param name="nums">Values to add.</param>
    /// <returns>The sum of every element; zero when the array is empty.</returns>
    public static T AddTotal<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        Vector<T> laneSums = Vector<T>.Zero;
        int index = 0;

        // Vectorized part: add one full vector of elements per iteration.
        while (index <= nums.Length - lanes)
        {
            laneSums += new Vector<T>(nums, index);
            index += lanes;
        }

        // Dot product with an all-ones vector collapses the lanes into their sum.
        T total = Vector.Dot(laneSums, Vector<T>.One);

        // Tail: elements that did not fill a whole vector.
        while (index < nums.Length)
        {
            total += nums[index];
            index++;
        }

        return total;
    }

还有一种方式不使用Vector,而是直接使用Avx2类下的方法做运算,需要加个是否支持的判断

    /// <summary>
    /// Sums an int array with explicit AVX2 intrinsics, eight 32-bit lanes at a time.
    /// </summary>
    /// <param name="nums">Values to add.</param>
    /// <returns>The sum of every element; 0 when the array is empty.</returns>
    /// <exception cref="NotSupportedException">AVX2 is not supported on the current hardware.</exception>
    public unsafe static int AddTotal_Avx2(int[] nums)
    {
        if (!Avx2.IsSupported)
        {
            throw new NotSupportedException();
        }

        // 256-bit register / 8 bits per byte / 4 bytes per int = 8 lanes.
        const int lanes = 256 / 8 / sizeof(int);

        Vector256<int> laneSums = Vector256<int>.Zero;
        int index = 0;
        fixed (int* basePtr = nums)
        {
            // Vectorized part: load and add one 256-bit block per iteration.
            while (index <= nums.Length - lanes)
            {
                laneSums = Avx2.Add(laneSums, Avx2.LoadVector256(basePtr + index));
                index += lanes;
            }
        }

        // Spill the accumulator to the stack and fold the lanes into a scalar.
        int* spill = stackalloc int[lanes];
        Avx2.Store(spill, laneSums);

        int total = 0;
        for (int lane = 0; lane < lanes; lane++)
        {
            total += spill[lane];
        }

        // Tail: elements that did not fill a whole 256-bit block.
        while (index < nums.Length)
        {
            total += nums[index];
            index++;
        }

        return total;
    }

但是我们要做好回落,比如没有avx2就用sse,没有sse就用普通的

不过System.Runtime.Intrinsics.X86下面一堆这种,再说了还有arm的,所以通用性不如Vector方法

    /// <summary>
    /// Sums an int array using the widest x86 SIMD path the current CPU supports,
    /// falling back to a plain scalar loop when neither AVX2 nor SSE2 is available.
    /// </summary>
    /// <param name="nums">Values to add.</param>
    /// <returns>The sum of every element.</returns>
    /// <remarks>
    /// <c>AddTotal_Sse2</c> is not part of this excerpt and is assumed to be defined
    /// alongside <c>AddTotal_Avx2</c> with the same signature.
    /// </remarks>
    public unsafe static int AddTotal_2(int[] nums)
    {
        if (Avx2.IsSupported)
        {
            return AddTotal_Avx2(nums);
        }

        if (Sse2.IsSupported)
        {
            return AddTotal_Sse2(nums);
        }

        // Scalar fallback, done inline: NormalCalc.AddTotal takes double[] and returns
        // double, so it can neither accept this int[] nor produce the int result.
        int total = 0;
        for (int i = 0; i < nums.Length; i++)
        {
            total += nums[i];
        }

        return total;
    }

 

把Vector用在两个数组相加相乘上会更加简单

    /// <summary>
    /// Multiplies two equally sized arrays element by element.
    /// </summary>
    /// <typeparam name="T">A numeric value type usable as a <see cref="Vector{T}"/> element.</typeparam>
    /// <param name="numsl">Left-hand operands.</param>
    /// <param name="numsr">Right-hand operands.</param>
    /// <returns>A new array where element <c>i</c> is <c>numsl[i] * numsr[i]</c>.</returns>
    /// <exception cref="ArgumentException">The two arrays differ in length.</exception>
    public unsafe static T[] Multiply<T>(T[] numsl, T[] numsr) where T : struct,INumber<T>
    {
        int length = numsl.Length;
        if (length != numsr.Length)
        {
            throw new ArgumentException();
        }

        T[] products = new T[length];
        int lanes = Vector<T>.Count;
        int index = 0;

        // Vectorized part: multiply one full vector from each input and store it in place.
        while (index <= length - lanes)
        {
            Vector<T> left = new Vector<T>(numsl, index);
            Vector<T> right = new Vector<T>(numsr, index);
            (left * right).CopyTo(products, index);
            index += lanes;
        }

        // Tail: elements that did not fill a whole vector.
        for (; index < length; index++)
        {
            products[index] = numsl[index] * numsr[index];
        }

        return products;
    }

 

批量加1

普通方法:

    /// <summary>
    /// Increments every element of <paramref name="nums"/> by one, in place (scalar baseline).
    /// </summary>
    /// <param name="nums">Array whose elements are incremented.</param>
    public static void AddOne(int[] nums)
    {
        int index = 0;
        while (index < nums.Length)
        {
            nums[index] += 1;
            index++;
        }
    }

SIMD:

    /// <summary>
    /// Increments every element of <paramref name="nums"/> by one, in place,
    /// processing one full <see cref="Vector{T}"/> of elements per iteration.
    /// </summary>
    /// <typeparam name="T">A numeric value type usable as a <see cref="Vector{T}"/> element.</typeparam>
    /// <param name="nums">Array whose elements are incremented.</param>
    public static void AddOne<T>(T[] nums) where T : struct, INumber<T>
    {
        int lanes = Vector<T>.Count;
        Vector<T> ones = Vector<T>.One;
        int index = 0;

        // Vectorized part: load a block, add the all-ones vector, write it back.
        while (index <= nums.Length - lanes)
        {
            Vector<T> incremented = new Vector<T>(nums, index) + ones;
            incremented.CopyTo(nums, index);
            index += lanes;
        }

        // Tail: elements that did not fill a whole vector.
        for (; index < nums.Length; index++)
        {
            nums[index]++;
        }
    }

 

跑分是  int类型 普通方法:390   SIMD:70

double类型  普通方法:578  SIMD:145

 

这对于我们平时普通计算的性能帮助还是有的,官方还用SIMD优化了Matrix的一些类,不过都是很小的二维矩阵,你可以根据自己的需要去设计更复杂的大矩阵运算类,如果需要更复杂的批量多维矩阵处理推荐OpenCvSharp

代码下载:https://wwu.lanzoub.com/iglMD032ky0f

 

参考链接:

https://zhuanlan.zhihu.com/p/60171538

https://habr.com/en/post/467689

https://www.zhihu.com/question/266256257

posted @ 2022-04-12 15:39  咖喱gg  阅读(705)  评论(0编辑  收藏  举报