/* TM1500 SAD optimization routines, used in the MPEG-4 video codec. */
#ifndef _ENCODER_SAD_H_
#define _ENCODER_SAD_H_
#define SCALETWOBIT 2 /* shift count: byte stride -> 32-bit-word stride (see stride_scale_4) */
#define SCALEONEBIT 1 /* shift count for halfword strides (unused in this file) */
#define ZERO 0 /* all-zero word; UME8UU against ZERO sums four unsigned bytes */
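/*
Implementation notes. The routines below rely on TriMedia custom operations
(per the TM1500/TCS custom-op set, as used throughout this file):
- UME8UU(a, b): sum of absolute differences of the four unsigned byte pairs in a and b;
- FUNSHIFTn(hi, lo): funnel shift combining two adjacent aligned words, used here to
synthesize an unaligned reference word from two aligned loads (a misalignment of
m bytes is paired with FUNSHIFT(4-m));
- ASRI(n, x): arithmetic shift right immediate;
- PACKBYTES / PACK16LSB: byte/halfword packing, used below to broadcast the block mean.
Naming convention: SAD16_m handles a reference pointer misaligned by m bytes; the
trailing-`s` variants are the last row of a block and omit the pointer advance.
The header assumes uint8_t/int32_t are already defined by the including file.
*/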
#define SAD16_1() \
s0 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT3((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT3((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ; \
curptr += stride_scale_4; \
refptr += stride_scale_4;
#define SAD16_1s() \
s0 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT3((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT3((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ;
#define SAD16_2() \
s0 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT2((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT2((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ; \
curptr += stride_scale_4; \
refptr += stride_scale_4;
#define SAD16_2s() \
s0 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT2((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT2((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ;
#define SAD16_3() \
s0 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT1((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT1((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ; \
curptr += stride_scale_4; \
refptr += stride_scale_4;
#define SAD16_3s() \
s0 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
s1 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
s2 = FUNSHIFT1((refptr)[3], (refptr)[2]); \
s3 = FUNSHIFT1((refptr)[4], (refptr)[3]); \
tmp0 = UME8UU((curptr)[0], s0); \
tmp1 = UME8UU((curptr)[1], s1); \
tmp2 = UME8UU((curptr)[2], s2); \
tmp3 = UME8UU((curptr)[3], s3); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ;
#define SAD16() \
tmp0 = UME8UU((curptr)[0], (refptr)[0]); \
tmp1 = UME8UU((curptr)[1], (refptr)[1]); \
tmp2 = UME8UU((curptr)[2], (refptr)[2]); \
tmp3 = UME8UU((curptr)[3], (refptr)[3]); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ; \
curptr += stride_scale_4; \
refptr += stride_scale_4;
#define SAD16s() \
tmp0 = UME8UU((curptr)[0], (refptr)[0]); \
tmp1 = UME8UU((curptr)[1], (refptr)[1]); \
tmp2 = UME8UU((curptr)[2], (refptr)[2]); \
tmp3 = UME8UU((curptr)[3], (refptr)[3]); \
sad += (tmp0 + tmp1) + (tmp2 + tmp3) ;
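/*
sad16_f: SAD between a 16x16 block of `cur` and `ref`. `cur` is dereferenced as
long* directly and must therefore be 32-bit aligned; `ref` may have any alignment -
the switch on its low two address bits selects the funnel-shift variant that
realigns each reference row.
*/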
static __inline int sad16_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride)
{
#pragma TCS_no_caller_save
int s0, s1, s2, s3;
int tmp0, tmp1, tmp2, tmp3;
int sad = 0;
const int stride_scale_4 = stride >> 2; /* byte stride -> 32-bit-word stride */
const int ref1 = (int) ref & ~3; /* word-aligned base of ref */
const int ref2 = (int) ref & 3; /* misalignment of ref in bytes */
long *restrict curptr;
long *restrict refptr;
curptr = (long*)cur;
refptr = (long*)ref1;
switch(ref2)
{
case 0:
SAD16(); SAD16(); SAD16(); SAD16();
SAD16(); SAD16(); SAD16(); SAD16();
SAD16(); SAD16(); SAD16(); SAD16();
SAD16(); SAD16(); SAD16(); SAD16s();
return sad;
case 1:
SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1s();
return sad;
case 2:
SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2s();
return sad;
default:
SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3s();
return sad;
}
/* Reference scalar implementation, kept for verification:
int32_t j;
int32_t sad = 0;
uint8_t const *ptr_cur = cur;
uint8_t const *ptr_ref = ref;
for (j = 0; j < 16; j++)
{
sad += abs(ptr_cur[0] - ptr_ref[0]);
sad += abs(ptr_cur[1] - ptr_ref[1]);
sad += abs(ptr_cur[2] - ptr_ref[2]);
sad += abs(ptr_cur[3] - ptr_ref[3]);
sad += abs(ptr_cur[4] - ptr_ref[4]);
sad += abs(ptr_cur[5] - ptr_ref[5]);
sad += abs(ptr_cur[6] - ptr_ref[6]);
sad += abs(ptr_cur[7] - ptr_ref[7]);
sad += abs(ptr_cur[8] - ptr_ref[8]);
sad += abs(ptr_cur[9] - ptr_ref[9]);
sad += abs(ptr_cur[10] - ptr_ref[10]);
sad += abs(ptr_cur[11] - ptr_ref[11]);
sad += abs(ptr_cur[12] - ptr_ref[12]);
sad += abs(ptr_cur[13] - ptr_ref[13]);
sad += abs(ptr_cur[14] - ptr_ref[14]);
sad += abs(ptr_cur[15] - ptr_ref[15]);
// if (sad >= best_sad)
// return sad;
ptr_cur += stride;
ptr_ref += stride;
}
return sad;
// */
}
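/* SAD8/SAD8s: one 8-pixel row (two 32-bit words); the `s` variant skips the final pointer advance. */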
#define SAD8() \
tmp0 = (refptr)[0]; \
tmp1 = (refptr)[1]; \
sad += UME8UU((curptr)[0], tmp0) + UME8UU((curptr)[1], tmp1); \
curptr += stride_scale_4; \
refptr += stride_scale_4;
#define SAD8s() \
tmp0 = (refptr)[0]; \
tmp1 = (refptr)[1]; \
sad += UME8UU((curptr)[0], tmp0) + UME8UU((curptr)[1], tmp1);
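/*
sad8_f: SAD of an 8x8 block. There is no misalignment dispatch here, so both
`cur` and `ref` must be 32-bit aligned.
*/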
static __inline int sad8_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1;
int sad = 0;
const int stride_scale_4 = stride>>2;
long *restrict curptr;
long *restrict refptr;
curptr = (long*)cur;
refptr = (long*)ref;
SAD8(); SAD8(); SAD8(); SAD8();
SAD8(); SAD8(); SAD8(); SAD8s();
return sad;
/* Reference scalar implementation, kept for verification:
int32_t j;
int32_t sad = 0;
uint8_t const *ptr_cur = cur;
uint8_t const *ptr_ref = ref;
for (j = 0; j < 8; j++)
{
sad += abs(ptr_cur[0] - ptr_ref[0]);
sad += abs(ptr_cur[1] - ptr_ref[1]);
sad += abs(ptr_cur[2] - ptr_ref[2]);
sad += abs(ptr_cur[3] - ptr_ref[3]);
sad += abs(ptr_cur[4] - ptr_ref[4]);
sad += abs(ptr_cur[5] - ptr_ref[5]);
sad += abs(ptr_cur[6] - ptr_ref[6]);
sad += abs(ptr_cur[7] - ptr_ref[7]);
ptr_cur += stride;
ptr_ref += stride;
}
return sad;
// */
}
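/* SAD16V/SAD16Vs: one pass over a row pair - one row of the top half (curptr0) and one of the bottom half (curptr2) - split into the four 8x8 sub-block accumulators. */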
#define SAD16V() \
tmp0 = UME8UU((curptr0)[0], (refptr0)[0]); \
tmp1 = UME8UU((curptr0)[1], (refptr0)[1]); \
sad0 += (tmp0 + tmp1); \
tmp2 = UME8UU((curptr0)[2], (refptr0)[2]); \
tmp3 = UME8UU((curptr0)[3], (refptr0)[3]); \
sad1 += (tmp2 + tmp3); \
tmp4 = UME8UU((curptr2)[0], (refptr2)[0]); \
tmp5 = UME8UU((curptr2)[1], (refptr2)[1]); \
sad2 += (tmp4 + tmp5); \
tmp6 = UME8UU((curptr2)[2], (refptr2)[2]); \
tmp7 = UME8UU((curptr2)[3], (refptr2)[3]); \
sad3 += (tmp6 + tmp7); \
curptr0 += stride_scale_4; \
refptr0 += stride_scale_4; \
curptr2 += stride_scale_4; \
refptr2 += stride_scale_4;
#define SAD16Vs() \
tmp0 = UME8UU((curptr0)[0], (refptr0)[0]); \
tmp1 = UME8UU((curptr0)[1], (refptr0)[1]); \
sad0 += (tmp0 + tmp1); \
tmp2 = UME8UU((curptr0)[2], (refptr0)[2]); \
tmp3 = UME8UU((curptr0)[3], (refptr0)[3]); \
sad1 += (tmp2 + tmp3); \
tmp4 = UME8UU((curptr2)[0], (refptr2)[0]); \
tmp5 = UME8UU((curptr2)[1], (refptr2)[1]); \
sad2 += (tmp4 + tmp5); \
tmp6 = UME8UU((curptr2)[2], (refptr2)[2]); \
tmp7 = UME8UU((curptr2)[3], (refptr2)[3]); \
sad3 += (tmp6 + tmp7);
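/*
sad16v_f: SADs of the four 8x8 sub-blocks of a 16x16 block in a single pass of
eight row iterations. On return sad[0..3] hold top-left, top-right, bottom-left
and bottom-right (matching the reference formulation in the comment below); the
total is returned. Both pointers must be 32-bit aligned.
*/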
static __inline int sad16v_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride, int *sad)
{
#pragma TCS_no_caller_save
int tmp0, tmp1, tmp2, tmp3;
int tmp4, tmp5, tmp6, tmp7;
int sad0 = 0;
int sad1 = 0;
int sad2 = 0;
int sad3 = 0;
const int stride_scale_4 = stride >> 2; /* byte stride -> 32-bit-word stride, i.e. ASRI(SCALETWOBIT, stride) */
long *restrict curptr0;
long *restrict refptr0;
long *restrict curptr2;
long *restrict refptr2;
curptr0 = (long*)cur;
refptr0 = (long*)ref;
curptr2 = (long*)(cur + (stride << 3)); /* bottom half: rows 8..15 */
refptr2 = (long*)(ref + (stride << 3));
SAD16V(); SAD16V(); SAD16V(); SAD16V();
SAD16V(); SAD16V(); SAD16V(); SAD16Vs();
sad[0] = sad0;
sad[1] = sad1;
sad[2] = sad2;
sad[3] = sad3;
return sad0+sad1+sad2+sad3;
/* Equivalent formulation via sad8_f, kept for reference:
sad[0] = sad8_f(cur, ref, stride);
sad[1] = sad8_f(cur + 8, ref + 8, stride);
sad[2] = sad8_f(cur + 8*stride, ref + 8*stride, stride);
sad[3] = sad8_f(cur + 8*stride + 8, ref + 8*stride + 8, stride);
return sad[0]+sad[1]+sad[2]+sad[3];
// */
}
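/*
dev16_f helpers: SUM16 sums one row's pixels via UME8UU against an all-zero word
(|p - 0| = p for unsigned bytes); MEAN16 derives the 8-bit block mean and
broadcasts it to all four byte lanes; DEV16 then accumulates |p - mean| per row.
*/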
#define SUM16() \
tmp0 = UME8UU((curptr)[0], ZERO); \
tmp1 = UME8UU((curptr)[1], ZERO); \
tmp2 = UME8UU((curptr)[2], ZERO); \
tmp3 = UME8UU((curptr)[3], ZERO); \
sum += (tmp0 + tmp1) + (tmp2 + tmp3); \
curptr += stride_scale_4;
#define SUM16s() \
tmp0 = UME8UU((curptr)[0], ZERO); \
tmp1 = UME8UU((curptr)[1], ZERO); \
tmp2 = UME8UU((curptr)[2], ZERO); \
tmp3 = UME8UU((curptr)[3], ZERO); \
sum += (tmp0 + tmp1) + (tmp2 + tmp3);
#define MEAN16() \
mean0 = ASRI(8, sum); /* block mean: sum of 256 pixels >> 8 */ \
mean1 = PACKBYTES( mean0, mean0 ); /* mean byte replicated into a halfword */ \
mean2 = PACKBYTES( mean0, mean0 ); /* same value again (presumably two independent operands for ILP) */ \
mean = PACK16LSB( mean1, mean2 ); /* mean byte broadcast to all four byte lanes */
#define DEV16() \
tmp0 = UME8UU((curptr)[0], mean); \
tmp1 = UME8UU((curptr)[1], mean); \
tmp2 = UME8UU((curptr)[2], mean); \
tmp3 = UME8UU((curptr)[3], mean); \
dev += (tmp0 + tmp1) + (tmp2 + tmp3); \
curptr += stride_scale_4;
#define DEV16s() \
tmp0 = UME8UU((curptr)[0], mean); \
tmp1 = UME8UU((curptr)[1], mean); \
tmp2 = UME8UU((curptr)[2], mean); \
tmp3 = UME8UU((curptr)[3], mean); \
dev += (tmp0 + tmp1) + (tmp2 + tmp3);
static __inline int dev16_f(const uint8_t* const cur, const int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1, tmp2, tmp3;
int sum = 0;
int dev = 0;
int mean, mean0, mean1, mean2;
const int stride_scale_4 = stride>>2;
long *restrict curptr;
curptr = (long*)cur;
SUM16(); SUM16(); SUM16(); SUM16();
SUM16(); SUM16(); SUM16(); SUM16();
SUM16(); SUM16(); SUM16(); SUM16();
SUM16(); SUM16(); SUM16(); SUM16s();
MEAN16();
curptr = (long*)cur;
DEV16(); DEV16(); DEV16(); DEV16();
DEV16(); DEV16(); DEV16(); DEV16();
DEV16(); DEV16(); DEV16(); DEV16();
DEV16(); DEV16(); DEV16(); DEV16s();
return dev;
/* Reference scalar implementation, kept for verification:
int32_t i, j;
int32_t mean = 0;
int32_t dev = 0;
uint8_t const *ptr_cur = cur;
for (j = 0; j < 16; j++)
{
for (i = 0; i < 16; i++)
mean += *(ptr_cur + i);
ptr_cur += stride;
}
mean /= (16 * 16);
ptr_cur = cur;
for (j = 0; j < 16; j++)
{
for (i = 0; i < 16; i++)
dev += abs(*(ptr_cur + i) - (int32_t) mean);
ptr_cur += stride;
}
return dev;
// */
}
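/*
Usage sketch (hypothetical, not part of the original interface): a motion-estimation
candidate check might call sad16_f like this. The names check_candidate, cur_mb,
ref_plane, x, y and best_sad are illustrative only; cur_mb must be 32-bit aligned,
and stride is the plane width in bytes, with the reference plane padded so the
candidate read stays in bounds.
static __inline int check_candidate(const uint8_t * const cur_mb,
const uint8_t * const ref_plane,
int x, int y, int32_t stride, int best_sad)
{
int s = sad16_f(cur_mb, ref_plane + y * stride + x, stride);
return (s < best_sad) ? s : best_sad;
}
// */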
#endif /* _ENCODER_SAD_H_ */