C# 使用SIMD系列方法加速批量运算
我们现在想做一些简单的批量运算,比如累乘得积,累加求和
/// <summary>
/// Scalar (non-SIMD) reference implementations of simple array reductions.
/// </summary>
public class NormalCalc
{
    /// <summary>
    /// Multiplies all elements of <paramref name="nums"/> together, front to back.
    /// </summary>
    /// <param name="nums">The factors.</param>
    /// <returns>The product of all elements; 1 when the array is empty.</returns>
    public static double Multiply(double[] nums)
    {
        double product = 1.0d;
        foreach (double factor in nums)
        {
            product *= factor;
        }
        return product;
    }

    /// <summary>
    /// Adds all elements of <paramref name="nums"/> together, front to back.
    /// </summary>
    /// <param name="nums">The terms.</param>
    /// <returns>The sum of all elements; 0 when the array is empty.</returns>
    public static double AddTotal(double[] nums)
    {
        double sum = 0.0d;
        foreach (double term in nums)
        {
            sum += term;
        }
        return sum;
    }
}
这种批量运算不正是指令集的优势么,那就试试吧
C#中可以使用Vector类来做宽位运算。我这里有avx2指令集,也就是256位的向量寄存器;double是64位的,所以一个Vector能放4个double,如果做int(32位)运算自然就能放8个
在这里就是每4个数放到一个Vector里一起做乘法运算,最后把Vector里的4个结果拷贝到数组中互乘,再把末尾不足4个的剩余元素乘完就好了;乘法嘛,用1作为种子(初始值)
/// <summary>
/// Multiplies all elements of <paramref name="nums"/> together, processing
/// <see cref="Vector{T}"/>-sized blocks of doubles at a time.
/// </summary>
/// <param name="nums">The factors.</param>
/// <returns>The product of all elements; 1 when the array is empty.</returns>
public unsafe static double Multiply(double[] nums)
{
    int lanes = Vector<double>.Count;
    // Each lane accumulates the product of every lanes-th element; 1 is the multiplicative seed.
    Vector<double> laneProducts = Vector<double>.One;
    int i = 0;
    fixed (double* p = nums)
    {
        int lastBlockStart = nums.Length - lanes;
        for (; i <= lastBlockStart; i += lanes)
        {
            // Array data is only element-aligned, not vector-aligned, so use the
            // unaligned read rather than Unsafe.Read (which assumes alignment).
            // The safe equivalent would be: new Vector<double>(nums, i).
            laneProducts *= Unsafe.ReadUnaligned<Vector<double>>(p + i);
        }
    }

    // Collapse the lanes into a single scalar, in lane order.
    double result = 1.0d;
    for (int lane = 0; lane < lanes; lane++)
    {
        result *= laneProducts[lane];
    }

    // Multiply in the tail elements that did not fill a whole vector.
    for (; i < nums.Length; i++)
    {
        result *= nums[i];
    }
    return result;
}
下一个问题就是,我总不见得又得每种数据类型都写一遍吧,咱有没有办法用C#的各种新特性写成泛型?咱有Span有预览特性INumber,试了下还真可以
用new Vector构造,泛型T用INumber约束就有了T.One来表示数字1,并且能随便的做乘法运算了
/// <summary>
/// Generic product of all elements of <paramref name="nums"/>, multiplying
/// <see cref="Vector{T}"/>-sized blocks lane-wise and then folding the lanes.
/// </summary>
/// <typeparam name="T">A numeric value type usable with <see cref="Vector{T}"/>.</typeparam>
/// <param name="nums">The factors.</param>
/// <returns>The product of all elements; <c>T.One</c> when the array is empty.</returns>
public static T Multiply<T>(T[] nums) where T : struct, INumber<T>
{
    int lanes = Vector<T>.Count;
    Vector<T> laneProducts = Vector<T>.One;
    int lastBlockStart = nums.Length - lanes;
    int index = 0;

    // Whole vector-sized blocks.
    while (index <= lastBlockStart)
    {
        laneProducts *= new Vector<T>(nums, index);
        index += lanes;
    }

    // Fold the per-lane products into one scalar, in lane order.
    T product = T.One;
    for (int lane = 0; lane < lanes; lane++)
    {
        product *= laneProducts[lane];
    }

    // Leftover elements that did not fill a vector.
    while (index < nums.Length)
    {
        product *= nums[index];
        index++;
    }
    return product;
}
理论上Span速度不会比指针快,new Vector不会比Unsafe.Read快,但是差不了太多,就能写成泛型方法
来测试一下速度:
// Build the input array of random factors.
double[] nums = new double[100000];
Random random = new Random();
for (int slot = 0; slot < nums.Length; slot++)
{
    nums[slot] = random.NextDouble() * 2.723;
}

// Plain scalar multiplication loop.
Stopwatch stopwatch = Stopwatch.StartNew();
for (int round = 0; round < 10000; round++)
{
    NormalCalc.Multiply(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);

// Vector
stopwatch.Restart();
for (int round = 0; round < 10000; round++)
{
    SIMD_Calc.Multiply(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);

// Vector + Span + INumber, written as a generic method.
stopwatch.Restart();
for (int round = 0; round < 10000; round++)
{
    SIMD_Calc.MultiplySpan(nums);
}
stopwatch.Stop();
Console.WriteLine(stopwatch.ElapsedMilliseconds);
结果为:
730
185
190
不错不错,效果还挺满意的
接下来来个泛型的累加
/// <summary>
/// Generic sum of all elements of <paramref name="nums"/>, adding
/// <see cref="Vector{T}"/>-sized blocks lane-wise and then folding the lanes.
/// </summary>
/// <typeparam name="T">A numeric value type usable with <see cref="Vector{T}"/>.</typeparam>
/// <param name="nums">The terms.</param>
/// <returns>The sum of all elements; zero when the array is empty.</returns>
public static T AddTotal<T>(T[] nums) where T : struct, INumber<T>
{
    int lanes = Vector<T>.Count;
    Vector<T> laneSums = Vector<T>.Zero;
    int lastBlockStart = nums.Length - lanes;
    int index = 0;

    // Whole vector-sized blocks.
    while (index <= lastBlockStart)
    {
        laneSums += new Vector<T>(nums, index);
        index += lanes;
    }

    // A dot product with the all-ones vector adds the lanes together.
    T total = Vector.Dot(laneSums, Vector<T>.One);

    // Leftover elements that did not fill a vector.
    while (index < nums.Length)
    {
        total += nums[index];
        index++;
    }
    return total;
}
还有一种方式不使用Vector,而是直接使用Avx2类下的方法做运算,需要加个是否支持的判断
/// <summary>
/// Sums an <c>int</c> array using AVX2 256-bit intrinsics directly.
/// </summary>
/// <param name="nums">The terms.</param>
/// <returns>The sum of all elements; 0 when the array is empty.</returns>
/// <exception cref="NotSupportedException">AVX2 is not supported on this machine.</exception>
public unsafe static int AddTotal_Avx2(int[] nums)
{
    if (!Avx2.IsSupported)
    {
        throw new NotSupportedException();
    }

    // Number of 32-bit ints in a 256-bit vector.
    int lanes = Vector256<int>.Count;
    Vector256<int> laneSums = Vector256<int>.Zero;
    int index = 0;
    fixed (int* basePtr = nums)
    {
        int lastBlockStart = nums.Length - lanes;
        while (index <= lastBlockStart)
        {
            laneSums = Avx2.Add(laneSums, Avx2.LoadVector256(basePtr + index));
            index += lanes;
        }
    }

    // Spill the accumulator to the stack and add its lanes together.
    int* spilled = stackalloc int[lanes];
    Avx2.Store(spilled, laneSums);
    int total = 0;
    for (int lane = 0; lane < lanes; lane++)
    {
        total += spilled[lane];
    }

    // Leftover elements that did not fill a vector.
    while (index < nums.Length)
    {
        total += nums[index];
        index++;
    }
    return total;
}
但是我们要做好回落,比如没有avx2就用sse,没有sse就用普通的
不过System.Runtime.Intrinsics.X86下面一堆这种,再说了还有arm的,所以通用性不如Vector方法
/// <summary>
/// Sums an <c>int</c> array, picking the widest available implementation:
/// AVX2 first, then SSE2, then a plain scalar loop.
/// </summary>
/// <param name="nums">The terms.</param>
/// <returns>The sum of all elements.</returns>
public unsafe static int AddTotal_2(int[] nums)
{
    if (Avx2.IsSupported)
    {
        return AddTotal_Avx2(nums);
    }

    if (Sse2.IsSupported)
    {
        return AddTotal_Sse2(nums);
    }

    // Scalar fallback, done inline: NormalCalc.AddTotal takes double[] and
    // returns double, so it cannot be called with an int[] here.
    int result = 0;
    for (int i = 0; i < nums.Length; i++)
    {
        result += nums[i];
    }
    return result;
}
把Vector用在两个数组相加相乘上会更加简单
/// <summary>
/// Multiplies two equally long arrays element by element.
/// </summary>
/// <typeparam name="T">A numeric value type usable with <see cref="Vector{T}"/>.</typeparam>
/// <param name="numsl">Left-hand factors.</param>
/// <param name="numsr">Right-hand factors.</param>
/// <returns>A new array whose i-th element is <c>numsl[i] * numsr[i]</c>.</returns>
/// <exception cref="ArgumentException">The two arrays differ in length.</exception>
public unsafe static T[] Multiply<T>(T[] numsl, T[] numsr) where T : struct,INumber<T>
{
    if (numsl.Length != numsr.Length)
    {
        throw new ArgumentException();
    }

    int length = numsl.Length;
    T[] products = new T[length];
    int lanes = Vector<T>.Count;
    int lastBlockStart = length - lanes;
    int index = 0;

    // Whole vector-sized blocks: multiply lane-wise and write straight into the output.
    while (index <= lastBlockStart)
    {
        Vector<T> left = new Vector<T>(numsl, index);
        Vector<T> right = new Vector<T>(numsr, index);
        (left * right).CopyTo(products, index);
        index += lanes;
    }

    // Leftover elements that did not fill a vector.
    while (index < length)
    {
        products[index] = numsl[index] * numsr[index];
        index++;
    }
    return products;
}
批量加1
普通方法:
/// <summary>
/// Increments every element of <paramref name="nums"/> by one, in place.
/// </summary>
/// <param name="nums">The array to modify.</param>
public static void AddOne(int[] nums)
{
    int index = 0;
    while (index < nums.Length)
    {
        nums[index] += 1;
        index++;
    }
}
SIMD:
/// <summary>
/// Increments every element of <paramref name="nums"/> by one, in place,
/// handling <see cref="Vector{T}"/>-sized blocks at a time.
/// </summary>
/// <typeparam name="T">A numeric value type usable with <see cref="Vector{T}"/>.</typeparam>
/// <param name="nums">The array to modify.</param>
public static void AddOne<T>(T[] nums) where T : struct, INumber<T>
{
    int lanes = Vector<T>.Count;
    Vector<T> ones = Vector<T>.One;
    int lastBlockStart = nums.Length - lanes;
    int index = 0;

    // Whole vector-sized blocks: load, add one to each lane, store back.
    while (index <= lastBlockStart)
    {
        (new Vector<T>(nums, index) + ones).CopyTo(nums, index);
        index += lanes;
    }

    // Leftover elements that did not fill a vector.
    while (index < nums.Length)
    {
        nums[index]++;
        index++;
    }
}
跑分是 int类型 普通方法:390 SIMD:70
double类型 普通方法:578 SIMD:145
这对于我们平时普通计算的性能帮助还是有的,官方还用SIMD优化了Matrix的一些类,不过都是很小的二维矩阵,你可以根据自己的需要去设计更复杂的大矩阵运算类,如果需要更复杂的批量多维矩阵处理推荐OpenCvSharp
代码下载:https://wwu.lanzoub.com/iglMD032ky0f
参考链接:
https://zhuanlan.zhihu.com/p/60171538
https://habr.com/en/post/467689
https://www.zhihu.com/question/266256257