高性能计算-Intel IPP库ARM移植示例(20)

1. 简介

(1) Intel® Integrated Performance Primitives,即英特尔集成性能基元(简称IPP),为信号、数据和图像处理特定应用领域,提供simd优化的一组全面的函数库。

(2) 本项目将对 exp、cos、sin、tone、Triangle函数用NEON向量化指令实现ARM移植版本,有串行和向量化两个版本。

(3)计算使用泰勒展开,使用前后加和项的比例关系,加速计算。达到接口要求的精度即可。

(4)项目地址:https://github.com/libo-0379/IPP_ARM

2. 函数及接口介绍

(1) exp

Parameters
pSrc Pointer to the source vector.
pDst Pointer to the destination vector.
len Number of elements in the vectors.
IppStatus ippsExp_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len);

(2) cos

Parameters
pSrc Pointer to the source vector.
pDst Pointer to the destination vector.
len Number of elements in the vectors.
IppStatus ippsCos_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len);

(3) sin

Parameters
pSrc Pointer to the source vector.
pDst Pointer to the destination vector.
len Number of elements in the vectors.
IppStatus ippsSin_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len);

(4) Tone

Description:
This function generates the tone with the specified frequency rFreq, phase pPhase, and magnitude magn.
The function computes len samples of the tone, and stores them in the array pDst. For real tones, each
generated value x[n] is defined as:
x[n] = magn * cos(2πn*rFreq + phase)
For complex tones, x[n] is defined as:
x[n] = magn * (cos(2πn*rFreq + phase)+j* sin(2πn*rFreq + phase))
The parameter hint suggests using specific code, which provides for either fast but less accurate calculation,
or more accurate but slower execution.
Parameters:
magn Magnitude of the tone, that is, the maximum value attained by
the wave.
pPhase Pointer to the phase of the tone relative to a cosine wave. It
must be in range [0.0, 2π). You can use the returned value to
compute the next continuous data block.
rFreq Frequency of the tone relative to the sampling frequency. It
must be in the interval [0.0, 0.5) for real tone and in [0.0, 1.0)
for complex tone.
pDst Pointer to the array that stores the samples.
len Number of samples to be computed.
IppStatus ippsTone_32f(Ipp32f* pDst, int len, Ipp32f magn, Ipp32f rFreq, Ipp32f*
pPhase, IppHintAlgorithm hint);

函数表达式

x[n] = magn * cos(2πn*rFreq + phase)

函数图像:

image

(5) Triangle

Description:
This function generates the triangle with the specified frequency rFreq, phase pointed by pPhase, and
magnitude magn. The function computes len samples of the triangle, and stores them in the array pDst. For
real triangle, x[n] is defined as:
x[n] = magn * cth(2π* rFreq*n + phase), n = 0, 1, 2,...
For complex triangles, x[n] is defined as:
x[n] = magn * [cth(2π* rFreq*n + phase) + j * sth(2π* rFreq*n + phase)], n = 0, 1, 2,...
See Triangle-Generating Functions for the definition of functions cth and sth.
The cth () function is determined as follows:
H = π + h
Parameters:
rFreq Frequency of the triangle relative to the sampling frequency. It
must be in range [0.0, 0.5).
pPhase Pointer to the phase of the triangle relative to a cosine
triangular analog wave. It must be in range [0.0, 2π). You can
use the returned value to compute the next continuous data
block.
magn Magnitude of the triangle, that is, the maximum value attained
by the wave.
asym Asymmetry h of a triangle. It must be in range [-π , π ). If h=0,
then the triangle is symmetric and a direct analog of a tone.
pDst Pointer to the array that stores the samples.
len Number of samples to be computed
IppStatus ippsTriangle_32fc(Ipp32fc* pDst, int len, Ipp32f magn, Ipp32f rFreq, Ipp32f
asym, Ipp32f* pPhase);

函数实部和虚部

image

函数图像

image

3. 核心代码

#include <stdio.h>
#include <math.h>
#include "../include/ipp.h"
IppStatus ippsExp_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len)
{
for(int i=0;i<len/4*4;i+=4)
{
//每个元素泰勒展开项计数,初始化为0
float32x4_t vn = vdupq_n_f32(0);
float32x4_t vx = vld1q_f32(pSrc+i);
//记录当前项的展开值,初始化为第一项的值
float32x4_t vprior = vdupq_n_f32(1.0);
//保存每个元素展开的求和
float32x4_t vsum = vprior;
while(1)
{
//计算当前展开项
// 1. 计算 prior * x / ++n
vn = vaddq_f32(vn,vdupq_n_f32(1));
// 自己实现的 vdivq_f32_neon 对 -20.055 计算误差太大,改用系统提供接口 vdivq_f32
// float32x4_t vcur = vdivq_f32_neon(vmulq_f32(vprior,vx),vn);
float32x4_t vcur = vdivq_f32(vmulq_f32(vprior,vx),vn);
// 2. 将当前项赋值为前项
vprior = vcur;
// printf("0 %f\n",vgetq_lane_f32(vcur,0));
// 3. 如果所有元素的最大展开项小于精度控制,不再计算
if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON < 0)
break;
// 4. 将当前展开项加和
vsum = vaddq_f32(vsum,vcur);
}
// printf("n %f\n",vgetq_lane_f32(vn,0));
vst1q_f32(dst+i,vsum);
}
//处理尾部数据
for(int i=len/4*4;i<len;i++)
{
// 初始化第一个值
int n = 0;
double prior = 1.0; //前一项数值初始化为第一项的数值
double sum = prior; //求和保存结果
Ipp32f x = *(pSrc+i);
while (1)
{
double cur = prior * x / ++n; //当前加项的数值 = 前项*(src[i]/n)
prior = cur;
// printf("%f\n",cur);
if (fabs(cur)-EPSILON <= 0) //如果小于精度值停止计算
break;
sum += cur;
}
// printf("n %d\n",n);
*(dst+i) = sum;
}
return ippStsNoErr;
}
IppStatus ippsCos_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len)
{
float32x4_t v2PI = vdupq_n_f32(IPP_2PI);
for(int i=0;i<len/4*4;i+=4)
{
//每个元素泰勒展开项计数,初始化为0
float32x4_t vn = vdupq_n_f32(0);
float32x4_t vx = vld1q_f32(pSrc+i);
//记录当前项的展开值,初始化为第一项的值
float32x4_t vprior = vdupq_n_f32(1.0);
//保存每个元素展开的求和
float32x4_t vsum = vprior;
//将x转换到 [-2π,2π]
//类型转换向下取整,注意这里是浮点数除法
float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI)));
vx = vfmsq_f32(vx,v2PI,vk);
while(1)
{
//计算当前展开项
// 1. 计算 prior * (x^2) / (2(++n)*(2n-1))
vn = vaddq_f32(vn,vdupq_n_f32(1));
float32x4_t vcur = vmulq_n_f32(vn,2);
vcur = vdivq_f32_neon(vmulq_f32(vprior,vmulq_f32(vx,vx)),vmulq_f32(vcur,vsubq_f32(vcur,vdupq_n_f32(1.0))));
//将当前项赋值为前项,不带符号
vprior = vcur;
// 2. 添加正负号
float32_t n = vgetq_lane_f32(vn,0);
vcur = vmulq_n_f32(vcur,(int)n%2==0?1:-1);
// 3. 如果所有元素的展开项绝对值最大项小于精度控制,不再计算
if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON <= 0)
break;
// 4. 将当前展开项加和
vsum = vaddq_f32(vsum,vcur);
}
printf("n %f\n",vgetq_lane_f32(vn,0));
vst1q_f32(dst+i,vsum);
}
//处理尾部数据
for(int i=len/4*4;i<len;i++)
{
// 初始化第一个值
int n = 0;
double prior = 1.0; //前一项数值初始化为第一项的数值
double sum = prior; //求和保存结果
Ipp64f x = *(pSrc+i);
//对x转换到 [-2π,2π]
Ipp32s k = x/IPP_2PI;
x -= k*IPP_2PI;
while (1)
{
int32_t temp1 = 2* ++n;
double cur = prior * (x * x) /(temp1-- * temp1) ; //当前加项的数值 = 前项*(x^2/(2n*(2n-1)))
prior = cur;
if (fabs(cur) - EPSILON <= 0) //如果小于精度值停止计算
break;
sum += (n%2==0?1:-1)*cur; //符号位
}
// printf("n %d\n",n);
*(dst+i) = sum;
}
return ippStsNoErr;
}
IppStatus ippsSin_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len)
{
float32x4_t v2PI = vdupq_n_f32(IPP_2PI);
for(int i=0;i<len/4*4;i+=4)
{
//每个元素泰勒展开项计数,初始化为0
float32x4_t vn = vdupq_n_f32(0);
float32x4_t vx = vld1q_f32(pSrc+i);
//将x转换到 [-2π,2π]
//类型转换向下取整,注意这里是浮点数除法
float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI)));
vx = vfmsq_f32(vx,v2PI,vk);
// !! 注意这里易错点,应在区间转换后再赋值给 vprior
//记录当前项的展开值,初始化为第一项的值
float32x4_t vprior = vx;
//保存每个元素展开的求和
float32x4_t vsum = vprior;
printf("x0 %f\n",vgetq_lane_f32(vx,0));
while(1)
{
// 1. 计算 prior * (x^2) / (2(++n)*(2n+1))
vn = vaddq_f32(vn,vdupq_n_f32(1));
float32x4_t vcur = vmulq_n_f32(vn,2);
vcur = vdivq_f32_neon(vmulq_f32(vprior,vmulq_f32(vx,vx)),vmulq_f32(vcur,vaddq_f32(vcur,vdupq_n_f32(1))));
// printf("cur0 %f\n",vgetq_lane_f32(vcur,0));
//将当前项赋值为前项,不带符号
vprior = vcur;
// 2. 添加正负号
float32_t n = vgetq_lane_f32(vn,0);
vcur = vmulq_n_f32(vcur,(int)n%2==0?1:-1);
// 3.如果所有元素的展开项绝对值最大项小于精度控制,不再计算
if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON <= 0)
break;
// 4.将当前展开项加和
vsum = vaddq_f32(vsum,vcur);
}
printf("n %f\n",vgetq_lane_f32(vn,0));
vst1q_f32(dst+i,vsum);
}
//处理尾部数据
for(int i=len/4*4;i<len;i++)
{
// 初始化第一个值
int n = 0;
Ipp64f x = *(pSrc+i);
//将x转换到 [-2π,2π]
Ipp32s k = x/IPP_2PI;
x -= k*IPP_2PI;
double prior = x; //前一项数值初始化为第一项的数值
double sum = prior; //求和保存结果
while (1)
{
int32_t temp1 = 2* ++n;
double cur = prior * (x * x) /(temp1++ * temp1) ; //当前加项的数值 = 前项*(x^2/(2n*(2n+1)))
prior = cur;
if (fabs(cur) - EPSILON <= 0) //如果小于精度值停止计算
break;
sum += (n%2==0?1:-1)*cur; //符号位
}
// printf("n %d\n",n);
*(dst+i) = sum;
}
return ippStsNoErr;
}
IppStatus ippsTone_32f(Ipp32f* pDst, int len, Ipp32f magn, Ipp32f rFreq,
Ipp32f* pPhase, IppHintAlgorithm hint)
{
if(rFreq<0.0 || (rFreq-0.5)>EPSILON)
return ippStsToneFreqErr;
if(*pPhase<0.0 || (*pPhase-IPP_2PI)>=EPSILON)
return ippStsTonePhaseErr;
if(magn<=0.0)
return ippStsToneMagnErr;
float32x4_t vphase = vdupq_n_f32(*pPhase);
float32x4_t vrFreq= vdupq_n_f32(rFreq);
for(int i=0;i<len/4*4;i+=4)
{
//保存结果
float32_t result[4] = {0.0};
float32x4_t vresult = {0.0};
// IPP_2PI*rFreq*i + phase
float32_t x[4] = {i,i+1,i+2,i+3};
float32x4_t vx = vld1q_f32(x);
vx = vaddq_f32(vphase,vmulq_n_f32(vmulq_f32(vrFreq,vx),IPP_2PI));
vst1q_f32(x,vx);
ippsCos_32f_A24(x,result,4);
vresult = vld1q_f32(result);
vst1q_f32(pDst+i,vmulq_n_f32(vresult,magn));
}
for(int i=len/4*4;i<len;i++)
{
float32_t result = 0.0;
Ipp32f x = IPP_2PI*rFreq*i + (*pPhase);
ippsCos_32f_A24_s(&x,&result,1);
*(pDst+i) = result * magn; //类型强转
}
return ippStsNoErr;
}
IppStatus ippsTriangle_32fc(Ipp32fc* pDst, int len, Ipp32f magn, Ipp32f rFreq,
Ipp32f asym, Ipp32f* pPhase)
{
if(rFreq<0.0 || (rFreq-0.5)>EPSILON)
return ippStsToneFreqErr;
if(*pPhase<0.0 || (*pPhase-IPP_2PI)>=EPSILON)
return ippStsTonePhaseErr;
if(magn<=0.0)
return ippStsToneMagnErr;
float32x4_t vH = vdupq_n_f32(IPP_PI+asym);
printf("h %f\n",IPP_PI+asym);
float32x4_t vPI = vdupq_n_f32(IPP_PI);
float32x4_t v2PI = vdupq_n_f32(IPP_2PI);
float32x4_t v2PI_H = vsubq_f32(v2PI,vH);
float32x4_t vphase = vdupq_n_f32(*pPhase);
float32x4_t vrFreq= vdupq_n_f32(rFreq);
for(int i=0;i<len/4*4;i+=4)
{
float32_t x[4] = {i,i+1,i+2,i+3};
float32x4_t vx = vld1q_f32(x);
// 计算位置 x = 2*PI*rFreq*n + phase
vx = vaddq_f32(vphase,vmulq_n_f32(vmulq_f32(vrFreq,vx),IPP_2PI));
//因为x>0 可以转换到 [0,2π]
//类型转换向下取整,注意这里是浮点数除法
float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI)));
vx = vfmsq_f32(vx,v2PI,vk);
printf("x %f\n",vgetq_lane_f32(vx,0));
vst1q_f32(x,vx);
// for(int i=0;i<4;i++)
// printf("x %f\n",x[i]);
//保存结果包含实部虚部
float32x4x2_t vresult;
//实部
float32x4_t vresultRE;
float32x4_t vresultRE1;
float32x4_t vresultRE2;
//虚部
float32x4_t vresultIM;
float32x4_t vresultIM_1;
float32x4_t vresultIM1;
float32x4_t vresultIM2;
float32x4_t vresultIM3;
//对实部和虚部所有区间都计算最后根据 x 的范围取舍
//计算实部
//[0,H] -2*x/H + 1
vresultRE1 = vaddq_f32(vdupq_n_f32(1.0),vdivq_f32_neon(vmulq_n_f32(vx,-2),vH));
//[H,2π] (2(x-PI)-H)/(2PI-H)
vresultRE2 = vdivq_f32_neon(vsubq_f32(vmulq_n_f32(vsubq_f32(vx,vPI),2),vH),v2PI_H);
//计算虚部
//[0,π-H/2] 2*x/(2PI-H)
vresultIM1 = vdivq_f32_neon(vmulq_n_f32(vx,2),v2PI_H);
//[π-H/2,π+H/2] -2*(x-PI)/H
vresultIM2 = vdivq_f32_neon(vmulq_n_f32(vsubq_f32(vx,vPI),-2),vH);
//[π+H/2,2π] 2(x-2PI)/(2PI-H)
vresultIM3 = vdivq_f32_neon(vmulq_n_f32(vsubq_f32(vx,v2PI),2),v2PI_H);
//实部取舍
uint32x4_t vcompare = vcltq_f32(vx,vH);
printf("c %u\n",vgetq_lane_u32(vcompare,0));
vresultRE = vbslq_f32(vcompare,vresultRE1,vresultRE2);
vresultRE = vmulq_n_f32(vresultRE,magn);
//虚部取舍
float32x4_t v2_H = vdivq_f32_neon(vH,vdupq_n_f32(2));
// x< π-H/2 取 vresultIM1,否则取 vresultIM2
vcompare = vcltq_f32(vx,vsubq_f32(vPI,v2_H));
vresultIM_1 = vbslq_f32(vcompare,vresultIM1,vresultIM2);
// x> π+H/2 取 vresultIM3,否则取 vresultIM
vcompare = vcgtq_f32(vx,vaddq_f32(vPI,v2_H));
vresultIM = vbslq_f32(vcompare,vresultIM3,vresultIM_1);
vresultIM = vmulq_n_f32(vresultIM,magn);
vresult.val[0] = vresultRE;
vresult.val[1] = vresultIM;
printf("re %f\n",vgetq_lane_f32(vresultRE,0));
printf("im %f\n",vgetq_lane_f32(vresultIM,0));
vst2q_f32((float32_t*)(pDst+i),vresult);
}
Ipp32f H = IPP_PI+asym;
for(int i=len/4*4;i<len;i++)
{
Ipp32f x = IPP_2PI*rFreq*i + *pPhase;
//x 需要转换到 [0,2π]
Ipp32s k = x/IPP_2PI;
x -= k*IPP_2PI;
Ipp32fc result;
// printf("x %f\n",x);
//计算实部
if(x>=0 && (H-x)>=EPSILON)
result.re = -2*x/H + 1.0; // -2*x/H + 1
else
result.re = (2*(x-IPP_PI)-H)/(IPP_2PI-H);// (2(x-PI)-H)/(2PI-H)
//计算虚部
if(x>=0.0 && ((IPP_2PI-H)/2)-x >= EPSILON)
result.im = 2*x/(IPP_2PI-H); // 2*x/(2PI-H)
else if(x-(IPP_2PI-H)/2 >=EPSILON && (IPP_2PI+H)/2-x>=EPSILON)
result.im = -2*(x-IPP_PI)/H; //-2*(x-PI)/H
else
result.im = 2*(x-IPP_2PI)/(IPP_2PI-H); //2(x-2PI)/(2PI-H)
result.re *= magn;
result.im *= magn;
*(pDst+i) = result;
}
return ippStsNoErr;
}

测试代码

#include <stdio.h>
#include "../include/ipp.h"
#include "../include/ipp_s.h"
#define N 5
int main()
{
Ipp32f src[N] = {-20.055, -5.456, 7.835, 11.89, 18.76};
Ipp32f dst[N] ={0.0};
for(int i=0;i<N;i++)
{
ippsExp_32f_A24_s(&src[i],&dst[i],1);
printf("result %f\n",dst[i]);
}
printf("======\n");
ippsExp_32f_A24(src,dst,N);
for(int i=0;i<N;i++)
printf("result %f\n",dst[i]);
printf("======\n");
for(int i=0;i<N;i++)
{
ippsCos_32f_A24_s(&src[i],&dst[i],1);
printf("result %f\n",dst[i]);
}
printf("======\n");
ippsCos_32f_A24(src,dst,N);
for(int i=0;i<N;i++)
printf("result %f\n",dst[i]);
printf("======\n");
for(int i=0;i<N;i++)
{
ippsSin_32f_A24_s(&src[i],&dst[i],1);
printf("result %f\n",dst[i]);
}
printf("======\n");
ippsSin_32f_A24(src,dst,N);
for(int i=0;i<N;i++)
printf("result %f\n",dst[i]);
printf("======\n");
Ipp32f phase = 5.3865;
ippsTone_32f_s(dst,N,3,0.2456,&phase,ippAlgHintFast);
for(int i=0;i<N;i++)
printf("result %f\n",dst[i]);
printf("======\n");
ippsTone_32f(dst,N,3,0.2456,&phase,ippAlgHintFast);
for(int i=0;i<N;i++)
printf("result %f\n",dst[i]);
printf("======\n");
Ipp32fc dst1[N];
ippsTriangle_32fc_s(dst1,N,3,0.2456,1.47,&phase);
for(int i=0;i<N;i++)
printf("result %f %f\n",dst1[i].re,dst1[i].im);
printf("======\n");
ippsTriangle_32fc(dst1,N,3,0.2456,1.47,&phase);
for(int i=0;i<N;i++)
printf("result %f %f\n",dst1[i].re,dst1[i].im);
printf("======\n");
return 0;
}

4. Intel IPP库测试

官网安装教程:https://www.intel.cn/content/www/cn/zh/developer/tools/oneapi/hpc-toolkit-download.html?packages=hpc-toolkit&hpc-toolkit-os=linux&hpc-toolkit-lin=offline

运行前设置环境变量

source /opt/intel/oneapi/setvars.sh

编译命令

icx test.c -o test -lippvm -lippcore -lipps

5. 测试数据

函数 串行单精度 FIELD3 串行双精度 FIELD5 向量化单精度 FIELD7 intel IPP FIELD9
ippsExp_32f_A24 1.976372 0 1.976372 0
0.004269 0.00427 0.004269 0.004271
2527.535645 2527.535645 2527.535645 2527.535645
145801.3281 145801.3438 145801.3281 145801.3438
140399216 140399184 140399184 140399184
ippsCos_32f_A24 0.357278 0.357278 0.357279 0.357278
0.676947 0.67695 0.676952 0.67695
0.01898 0.01898 0.018981 0.01898
0.77985 0.77985 0.779854 0.77985
0.995996 0.995993 0.995993 0.995993
ippsSin_32f_A24 -0.933998 -0.933998 -0.933998 -0.933998
0.73603 0.736029 0.73603 0.736029
0.99982 0.99982 0.99982 0.99982
-0.625968 -0.625967 -0.625965 -0.625966
-0.089438 -0.089436 -0.089436 -0.089436
ippsTone_32f 1.872611 1.872609 1.872612 1.872609
2.394652 2.394654 2.394655 2.394655
-1.740218 -1.74022 -1.74022 -1.74022
-2.490863 -2.490862 -2.490861 -2.490863
1.602509 1.602513 1.602509 1.602513
ippsTriangle_32fc 实部 虚部 实部 虚部 实部 虚部 实部 虚部
-0.218555 -2.920779 -0.218555 -2.920779 -0.218555 -2.920779
2.158905 2.320416 2.158905 2.320415 2.158905 2.320415
0.15116 1.238589 0.15116 1.238589 0.15116 1.238588
-1.856586 -0.769157 -1.856585 -0.769157 -1.856585 -0.769157
-0.615485 -2.776901 -0.615485 -2.776901 -0.615484 -2.776901

6. 结果分析

(1) 用单精度计算的结果可能误差较大,无法保证接口A24要求单精度十进制6位数的准确,建议使用双精度计算转换成单精度结果;

(2) NEON 提供有 vdiv 除法接口,比使用倒数的自定义版本精度更高。

posted @   安洛8  阅读(62)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂
点击右上角即可分享
微信分享提示