1. 简介
(2) 本项目将对 exp、cos、sin、tone、Triangle函数用NEON向量化指令实现ARM移植版本,有串行和向量化两个版本。
(3)计算使用泰勒展开,使用前后加和项的比例关系,加速计算。达到接口要求的精度即可。
2. 函数及接口介绍
(1) exp
| Parameters |
| pSrc Pointer to the source vector. |
| pDst Pointer to the destination vector. |
| len Number of elements in the vectors. |
| |
| IppStatus ippsExp_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len); |
(2) cos
| Parameters |
| pSrc Pointer to the source vector. |
| pDst Pointer to the destination vector. |
| len Number of elements in the vectors. |
| |
| IppStatus ippsCos_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len); |
(3) sin
| Parameters |
| pSrc Pointer to the source vector. |
| pDst Pointer to the destination vector. |
| len Number of elements in the vectors. |
| |
| IppStatus ippsSin_32f_A24 (const Ipp32f* pSrc, Ipp32f* pDst, Ipp32s len); |
(4) Tone
| Description: |
| This function generates the tone with the specified frequency rFreq, phase pPhase, and magnitude magn. |
| The function computes len samples of the tone, and stores them in the array pDst. For real tones, each |
| generated value x[n] is defined as: |
| x[n] = magn * cos(2πn*rFreq + phase) |
| For complex tones, x[n] is defined as: |
| x[n] = magn * (cos(2πn*rFreq + phase)+j* sin(2πn*rFreq + phase)) |
| The parameter hint suggests using specific code, which provides for either fast but less accurate calculation, |
| or more accurate but slower execution. |
| |
| Parameters: |
| magn Magnitude of the tone, that is, the maximum value attained by |
| the wave. |
| pPhase Pointer to the phase of the tone relative to a cosine wave. It |
| must be in range [0.0, 2π). You can use the returned value to |
| compute the next continuous data block. |
| rFreq Frequency of the tone relative to the sampling frequency. It |
| must be in the interval [0.0, 0.5) for real tone and in [0.0, 1.0) |
| for complex tone. |
| pDst Pointer to the array that stores the samples. |
| len Number of samples to be computed. |
| |
| IppStatus ippsTone_32f(Ipp32f* pDst, int len, Ipp32f magn, Ipp32f rFreq, Ipp32f* |
| pPhase, IppHintAlgorithm hint); |
函数表达式
x[n] = magn * cos(2πn*rFreq + phase)
函数图像:

(5) Triangle
| Description: |
| This function generates the triangle with the specified frequency rFreq, phase pointed by pPhase, and |
| magnitude magn. The function computes len samples of the triangle, and stores them in the array pDst. For |
| real triangle, x[n] is defined as: |
| x[n] = magn * cth(2π* rFreq*n + phase), n = 0, 1, 2,... |
| For complex triangles, x[n] is defined as: |
| x[n] = magn * [cth(2π* rFreq*n + phase) + j * sth(2π* rFreq*n + phase)], n = 0, 1, 2,... |
| See Triangle-Generating Functions for the definition of functions cth and sth. |
| The cth () function is determined as follows: |
| H = π + h |
| |
| Parameters: |
| rFreq Frequency of the triangle relative to the sampling frequency. It |
| must be in range [0.0, 0.5). |
| pPhase Pointer to the phase of the triangle relative to a cosine |
| triangular analog wave. It must be in range [0.0, 2π). You can |
| use the returned value to compute the next continuous data |
| block. |
| magn Magnitude of the triangle, that is, the maximum value attained |
| by the wave. |
| asym Asymmetry h of a triangle. It must be in range [-π , π ). If h=0, |
| then the triangle is symmetric and a direct analog of a tone. |
| pDst Pointer to the array that stores the samples. |
| len Number of samples to be computed |
| |
| IppStatus ippsTriangle_32fc(Ipp32fc* pDst, int len, Ipp32f magn, Ipp32f rFreq, Ipp32f |
| asym, Ipp32f* pPhase); |
函数实部和虚部

函数图像

3. 核心代码
| #include <stdio.h> |
| #include <math.h> |
| #include "../include/ipp.h" |
| |
| IppStatus ippsExp_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len) |
| { |
| for(int i=0;i<len/4*4;i+=4) |
| { |
| |
| float32x4_t vn = vdupq_n_f32(0); |
| float32x4_t vx = vld1q_f32(pSrc+i); |
| |
| float32x4_t vprior = vdupq_n_f32(1.0); |
| |
| float32x4_t vsum = vprior; |
| while(1) |
| { |
| |
| |
| vn = vaddq_f32(vn,vdupq_n_f32(1)); |
| |
| |
| float32x4_t vcur = vdivq_f32(vmulq_f32(vprior,vx),vn); |
| |
| vprior = vcur; |
| |
| |
| if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON < 0) |
| break; |
| |
| vsum = vaddq_f32(vsum,vcur); |
| } |
| |
| vst1q_f32(dst+i,vsum); |
| } |
| |
| |
| for(int i=len/4*4;i<len;i++) |
| { |
| |
| int n = 0; |
| double prior = 1.0; |
| double sum = prior; |
| Ipp32f x = *(pSrc+i); |
| while (1) |
| { |
| double cur = prior * x / ++n; |
| prior = cur; |
| |
| if (fabs(cur)-EPSILON <= 0) |
| break; |
| sum += cur; |
| } |
| |
| *(dst+i) = sum; |
| } |
| return ippStsNoErr; |
| } |
| |
| IppStatus ippsCos_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len) |
| { |
| float32x4_t v2PI = vdupq_n_f32(IPP_2PI); |
| for(int i=0;i<len/4*4;i+=4) |
| { |
| |
| float32x4_t vn = vdupq_n_f32(0); |
| float32x4_t vx = vld1q_f32(pSrc+i); |
| |
| float32x4_t vprior = vdupq_n_f32(1.0); |
| |
| float32x4_t vsum = vprior; |
| |
| |
| float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI))); |
| vx = vfmsq_f32(vx,v2PI,vk); |
| while(1) |
| { |
| |
| |
| vn = vaddq_f32(vn,vdupq_n_f32(1)); |
| float32x4_t vcur = vmulq_n_f32(vn,2); |
| vcur = vdivq_f32_neon(vmulq_f32(vprior,vmulq_f32(vx,vx)),vmulq_f32(vcur,vsubq_f32(vcur,vdupq_n_f32(1.0)))); |
| |
| vprior = vcur; |
| |
| float32_t n = vgetq_lane_f32(vn,0); |
| vcur = vmulq_n_f32(vcur,(int)n%2==0?1:-1); |
| |
| if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON <= 0) |
| break; |
| |
| vsum = vaddq_f32(vsum,vcur); |
| } |
| printf("n %f\n",vgetq_lane_f32(vn,0)); |
| vst1q_f32(dst+i,vsum); |
| } |
| |
| for(int i=len/4*4;i<len;i++) |
| { |
| |
| int n = 0; |
| double prior = 1.0; |
| double sum = prior; |
| Ipp64f x = *(pSrc+i); |
| |
| Ipp32s k = x/IPP_2PI; |
| x -= k*IPP_2PI; |
| while (1) |
| { |
| int32_t temp1 = 2* ++n; |
| double cur = prior * (x * x) /(temp1-- * temp1) ; |
| prior = cur; |
| if (fabs(cur) - EPSILON <= 0) |
| break; |
| sum += (n%2==0?1:-1)*cur; |
| } |
| |
| *(dst+i) = sum; |
| } |
| return ippStsNoErr; |
| } |
| |
| IppStatus ippsSin_32f_A24(const Ipp32f *pSrc, Ipp32f *dst, Ipp32s len) |
| { |
| float32x4_t v2PI = vdupq_n_f32(IPP_2PI); |
| for(int i=0;i<len/4*4;i+=4) |
| { |
| |
| float32x4_t vn = vdupq_n_f32(0); |
| float32x4_t vx = vld1q_f32(pSrc+i); |
| |
| |
| float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI))); |
| vx = vfmsq_f32(vx,v2PI,vk); |
| |
| |
| float32x4_t vprior = vx; |
| |
| float32x4_t vsum = vprior; |
| printf("x0 %f\n",vgetq_lane_f32(vx,0)); |
| while(1) |
| { |
| |
| vn = vaddq_f32(vn,vdupq_n_f32(1)); |
| float32x4_t vcur = vmulq_n_f32(vn,2); |
| vcur = vdivq_f32_neon(vmulq_f32(vprior,vmulq_f32(vx,vx)),vmulq_f32(vcur,vaddq_f32(vcur,vdupq_n_f32(1)))); |
| |
| |
| vprior = vcur; |
| |
| float32_t n = vgetq_lane_f32(vn,0); |
| vcur = vmulq_n_f32(vcur,(int)n%2==0?1:-1); |
| |
| if(vmaxvq_f32(vabsq_f32(vcur)) - EPSILON <= 0) |
| break; |
| |
| vsum = vaddq_f32(vsum,vcur); |
| } |
| printf("n %f\n",vgetq_lane_f32(vn,0)); |
| vst1q_f32(dst+i,vsum); |
| } |
| |
| for(int i=len/4*4;i<len;i++) |
| { |
| |
| int n = 0; |
| Ipp64f x = *(pSrc+i); |
| |
| Ipp32s k = x/IPP_2PI; |
| x -= k*IPP_2PI; |
| double prior = x; |
| double sum = prior; |
| while (1) |
| { |
| int32_t temp1 = 2* ++n; |
| double cur = prior * (x * x) /(temp1++ * temp1) ; |
| prior = cur; |
| if (fabs(cur) - EPSILON <= 0) |
| break; |
| sum += (n%2==0?1:-1)*cur; |
| } |
| |
| *(dst+i) = sum; |
| } |
| return ippStsNoErr; |
| } |
| |
| IppStatus ippsTone_32f(Ipp32f* pDst, int len, Ipp32f magn, Ipp32f rFreq, |
| Ipp32f* pPhase, IppHintAlgorithm hint) |
| { |
| if(rFreq<0.0 || (rFreq-0.5)>EPSILON) |
| return ippStsToneFreqErr; |
| if(*pPhase<0.0 || (*pPhase-IPP_2PI)>=EPSILON) |
| return ippStsTonePhaseErr; |
| if(magn<=0.0) |
| return ippStsToneMagnErr; |
| |
| float32x4_t vphase = vdupq_n_f32(*pPhase); |
| float32x4_t vrFreq= vdupq_n_f32(rFreq); |
| for(int i=0;i<len/4*4;i+=4) |
| { |
| |
| float32_t result[4] = {0.0}; |
| float32x4_t vresult = {0.0}; |
| |
| float32_t x[4] = {i,i+1,i+2,i+3}; |
| float32x4_t vx = vld1q_f32(x); |
| vx = vaddq_f32(vphase,vmulq_n_f32(vmulq_f32(vrFreq,vx),IPP_2PI)); |
| vst1q_f32(x,vx); |
| ippsCos_32f_A24(x,result,4); |
| vresult = vld1q_f32(result); |
| vst1q_f32(pDst+i,vmulq_n_f32(vresult,magn)); |
| } |
| for(int i=len/4*4;i<len;i++) |
| { |
| float32_t result = 0.0; |
| Ipp32f x = IPP_2PI*rFreq*i + (*pPhase); |
| ippsCos_32f_A24_s(&x,&result,1); |
| *(pDst+i) = result * magn; |
| } |
| return ippStsNoErr; |
| } |
| |
| IppStatus ippsTriangle_32fc(Ipp32fc* pDst, int len, Ipp32f magn, Ipp32f rFreq, |
| Ipp32f asym, Ipp32f* pPhase) |
| { |
| if(rFreq<0.0 || (rFreq-0.5)>EPSILON) |
| return ippStsToneFreqErr; |
| if(*pPhase<0.0 || (*pPhase-IPP_2PI)>=EPSILON) |
| return ippStsTonePhaseErr; |
| if(magn<=0.0) |
| return ippStsToneMagnErr; |
| |
| float32x4_t vH = vdupq_n_f32(IPP_PI+asym); |
| printf("h %f\n",IPP_PI+asym); |
| |
| float32x4_t vPI = vdupq_n_f32(IPP_PI); |
| float32x4_t v2PI = vdupq_n_f32(IPP_2PI); |
| float32x4_t v2PI_H = vsubq_f32(v2PI,vH); |
| float32x4_t vphase = vdupq_n_f32(*pPhase); |
| float32x4_t vrFreq= vdupq_n_f32(rFreq); |
| for(int i=0;i<len/4*4;i+=4) |
| { |
| float32_t x[4] = {i,i+1,i+2,i+3}; |
| float32x4_t vx = vld1q_f32(x); |
| |
| vx = vaddq_f32(vphase,vmulq_n_f32(vmulq_f32(vrFreq,vx),IPP_2PI)); |
| |
| |
| float32x4_t vk = vcvtq_f32_s32(vcvtq_s32_f32(vdivq_f32_neon(vx,v2PI))); |
| vx = vfmsq_f32(vx,v2PI,vk); |
| |
| printf("x %f\n",vgetq_lane_f32(vx,0)); |
| |
| vst1q_f32(x,vx); |
| |
| |
| |
| float32x4x2_t vresult; |
| |
| float32x4_t vresultRE; |
| float32x4_t vresultRE1; |
| float32x4_t vresultRE2; |
| |
| float32x4_t vresultIM; |
| float32x4_t vresultIM_1; |
| float32x4_t vresultIM1; |
| float32x4_t vresultIM2; |
| float32x4_t vresultIM3; |
| |
| |
| |
| vresultRE1 = vaddq_f32(vdupq_n_f32(1.0),vdivq_f32_neon(vmulq_n_f32(vx,-2),vH)); |
| |
| vresultRE2 = vdivq_f32_neon(vsubq_f32(vmulq_n_f32(vsubq_f32(vx,vPI),2),vH),v2PI_H); |
| |
| |
| |
| vresultIM1 = vdivq_f32_neon(vmulq_n_f32(vx,2),v2PI_H); |
| |
| vresultIM2 = vdivq_f32_neon(vmulq_n_f32(vsubq_f32(vx,vPI),-2),vH); |
| |
| vresultIM3 = vdivq_f32_neon(vmulq_n_f32(vsubq_f32(vx,v2PI),2),v2PI_H); |
| |
| |
| uint32x4_t vcompare = vcltq_f32(vx,vH); |
| |
| printf("c %u\n",vgetq_lane_u32(vcompare,0)); |
| |
| vresultRE = vbslq_f32(vcompare,vresultRE1,vresultRE2); |
| vresultRE = vmulq_n_f32(vresultRE,magn); |
| |
| float32x4_t v2_H = vdivq_f32_neon(vH,vdupq_n_f32(2)); |
| |
| vcompare = vcltq_f32(vx,vsubq_f32(vPI,v2_H)); |
| vresultIM_1 = vbslq_f32(vcompare,vresultIM1,vresultIM2); |
| |
| vcompare = vcgtq_f32(vx,vaddq_f32(vPI,v2_H)); |
| vresultIM = vbslq_f32(vcompare,vresultIM3,vresultIM_1); |
| vresultIM = vmulq_n_f32(vresultIM,magn); |
| |
| vresult.val[0] = vresultRE; |
| vresult.val[1] = vresultIM; |
| printf("re %f\n",vgetq_lane_f32(vresultRE,0)); |
| printf("im %f\n",vgetq_lane_f32(vresultIM,0)); |
| vst2q_f32((float32_t*)(pDst+i),vresult); |
| } |
| Ipp32f H = IPP_PI+asym; |
| for(int i=len/4*4;i<len;i++) |
| { |
| Ipp32f x = IPP_2PI*rFreq*i + *pPhase; |
| |
| Ipp32s k = x/IPP_2PI; |
| x -= k*IPP_2PI; |
| Ipp32fc result; |
| |
| |
| if(x>=0 && (H-x)>=EPSILON) |
| result.re = -2*x/H + 1.0; |
| else |
| result.re = (2*(x-IPP_PI)-H)/(IPP_2PI-H); |
| |
| if(x>=0.0 && ((IPP_2PI-H)/2)-x >= EPSILON) |
| result.im = 2*x/(IPP_2PI-H); |
| else if(x-(IPP_2PI-H)/2 >=EPSILON && (IPP_2PI+H)/2-x>=EPSILON) |
| result.im = -2*(x-IPP_PI)/H; |
| else |
| result.im = 2*(x-IPP_2PI)/(IPP_2PI-H); |
| result.re *= magn; |
| result.im *= magn; |
| *(pDst+i) = result; |
| } |
| |
| return ippStsNoErr; |
| } |
测试代码
| |
| |
| |
| |
| |
| |
| int main() |
| { |
| Ipp32f src[N] = {-20.055, -5.456, 7.835, 11.89, 18.76}; |
| Ipp32f dst[N] ={0.0}; |
| for(int i=0;i<N;i++) |
| { |
| ippsExp_32f_A24_s(&src[i],&dst[i],1); |
| printf("result %f\n",dst[i]); |
| } |
| printf("======\n"); |
| ippsExp_32f_A24(src,dst,N); |
| for(int i=0;i<N;i++) |
| printf("result %f\n",dst[i]); |
| printf("======\n"); |
| |
| for(int i=0;i<N;i++) |
| { |
| ippsCos_32f_A24_s(&src[i],&dst[i],1); |
| printf("result %f\n",dst[i]); |
| } |
| printf("======\n"); |
| ippsCos_32f_A24(src,dst,N); |
| for(int i=0;i<N;i++) |
| printf("result %f\n",dst[i]); |
| printf("======\n"); |
| |
| for(int i=0;i<N;i++) |
| { |
| ippsSin_32f_A24_s(&src[i],&dst[i],1); |
| printf("result %f\n",dst[i]); |
| } |
| printf("======\n"); |
| ippsSin_32f_A24(src,dst,N); |
| for(int i=0;i<N;i++) |
| printf("result %f\n",dst[i]); |
| printf("======\n"); |
| |
| Ipp32f phase = 5.3865; |
| ippsTone_32f_s(dst,N,3,0.2456,&phase,ippAlgHintFast); |
| for(int i=0;i<N;i++) |
| printf("result %f\n",dst[i]); |
| printf("======\n"); |
| ippsTone_32f(dst,N,3,0.2456,&phase,ippAlgHintFast); |
| for(int i=0;i<N;i++) |
| printf("result %f\n",dst[i]); |
| printf("======\n"); |
| |
| Ipp32fc dst1[N]; |
| ippsTriangle_32fc_s(dst1,N,3,0.2456,1.47,&phase); |
| for(int i=0;i<N;i++) |
| printf("result %f %f\n",dst1[i].re,dst1[i].im); |
| printf("======\n"); |
| |
| ippsTriangle_32fc(dst1,N,3,0.2456,1.47,&phase); |
| for(int i=0;i<N;i++) |
| printf("result %f %f\n",dst1[i].re,dst1[i].im); |
| printf("======\n"); |
| |
| return 0; |
| } |
4. Intel IPP库测试
运行前设置环境变量
| source /opt/intel/oneapi/setvars.sh |
编译命令
| icx test.c -o test -lippvm -lippcore -lipps |
5. 测试数据
函数 |
串行单精度 |
FIELD3 |
串行双精度 |
FIELD5 |
向量化单精度 |
FIELD7 |
intel IPP |
FIELD9 |
ippsExp_32f_A24 |
1.976372 |
|
0 |
|
1.976372 |
|
0 |
|
|
0.004269 |
|
0.00427 |
|
0.004269 |
|
0.004271 |
|
|
2527.535645 |
|
2527.535645 |
|
2527.535645 |
|
2527.535645 |
|
|
145801.3281 |
|
145801.3438 |
|
145801.3281 |
|
145801.3438 |
|
|
140399216 |
|
140399184 |
|
140399184 |
|
140399184 |
|
ippsCos_32f_A24 |
0.357278 |
|
0.357278 |
|
0.357279 |
|
0.357278 |
|
|
0.676947 |
|
0.67695 |
|
0.676952 |
|
0.67695 |
|
|
0.01898 |
|
0.01898 |
|
0.018981 |
|
0.01898 |
|
|
0.77985 |
|
0.77985 |
|
0.779854 |
|
0.77985 |
|
|
0.995996 |
|
0.995993 |
|
0.995993 |
|
0.995993 |
|
ippsSin_32f_A24 |
-0.933998 |
|
-0.933998 |
|
-0.933998 |
|
-0.933998 |
|
|
0.73603 |
|
0.736029 |
|
0.73603 |
|
0.736029 |
|
|
0.99982 |
|
0.99982 |
|
0.99982 |
|
0.99982 |
|
|
-0.625968 |
|
-0.625967 |
|
-0.625965 |
|
-0.625966 |
|
|
-0.089438 |
|
-0.089436 |
|
-0.089436 |
|
-0.089436 |
|
ippsTone_32f |
1.872611 |
|
1.872609 |
|
1.872612 |
|
1.872609 |
|
|
2.394652 |
|
2.394654 |
|
2.394655 |
|
2.394655 |
|
|
-1.740218 |
|
-1.74022 |
|
-1.74022 |
|
-1.74022 |
|
|
-2.490863 |
|
-2.490862 |
|
-2.490861 |
|
-2.490863 |
|
|
1.602509 |
|
1.602513 |
|
1.602509 |
|
1.602513 |
|
ippsTriangle_32fc |
实部 |
虚部 |
实部 |
虚部 |
实部 |
虚部 |
实部 |
虚部 |
|
-0.218555 |
-2.920779 |
-0.218555 |
-2.920779 |
|
|
-0.218555 |
-2.920779 |
|
2.158905 |
2.320416 |
2.158905 |
2.320415 |
|
|
2.158905 |
2.320415 |
|
0.15116 |
1.238589 |
0.15116 |
1.238589 |
|
|
0.15116 |
1.238588 |
|
-1.856586 |
-0.769157 |
-1.856585 |
-0.769157 |
|
|
-1.856585 |
-0.769157 |
|
-0.615485 |
-2.776901 |
-0.615485 |
-2.776901 |
|
|
-0.615484 |
-2.776901 |
6. 结果分析
(1) 用单精度计算的结果可能误差较大,无法保证接口A24要求单精度十进制6位数的准确,建议使用双精度计算转换成单精度结果;
(2) NEON 提供有 vdiv 除法接口,比使用倒数的自定义版本精度更高。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· DeepSeek R1 简明指南:架构、训练、本地部署及硬件要求
· NetPad:一个.NET开源、跨平台的C#编辑器
· PowerShell开发游戏 · 打蜜蜂