MCODEC - 感恩的心,感谢生命中的每一个人

(本站所有文章都是原创,转载请注明出处)
  首页  :: 联系 :: 订阅  :: 管理

tm1500 SAD 优化算法

Posted on 2009-10-24 09:09  mcodec  阅读(406)  评论(0)  编辑  收藏  举报

 

tm1500 SAD 优化算法,应用于MPEG4视频编解码算法中。


#ifndef _ENCODER_SAD_H_
#define _ENCODER_SAD_H_

/* Shift count of 2; not used by any live code visible in this header. */
#define SCALETWOBIT 2
/* Shift count of 1; not used by any live code visible in this header. */
#define SCALEONEBIT 1
/* All-zero word: SUM16()/SUM16s() pass it as the second UME8UU operand. */
#define ZERO  0

/*
 * Row kernels for the 16-byte-wide SAD routine.
 *
 * Every macro expands to a plain sequence of statements (not a single
 * expression) and uses these names from the expanding scope:
 *   curptr, refptr   - word pointers to the current / reference row
 *   stride_scale_4   - row pitch expressed in words
 *   sad              - running total
 *   tmp0..tmp3       - scratch
 *   s0..s3           - scratch, shifted variants only
 *
 * Naming: SAD16_<k>() handles one row and then steps both pointers to the
 * next row; the matching SAD16_<k>s() handles one row and leaves the
 * pointers alone (meant for the last row).  The variants without a <k>
 * suffix use the reference words as loaded; <k> = 1, 2, 3 first combine
 * neighbouring reference words with FUNSHIFT3, FUNSHIFT2 and FUNSHIFT1
 * respectively, reading five reference words instead of four.
 */

/* Step both row pointers down by one row. */
#define SAD16_NEXT_ROW() \
 curptr += stride_scale_4; \
 refptr += stride_scale_4;

/* One row, reference words combined pairwise through SHIFT_OP. */
#define SAD16_SHIFTED_ROW(SHIFT_OP) \
 s0 = SHIFT_OP(refptr[1], refptr[0]); \
 s1 = SHIFT_OP(refptr[2], refptr[1]); \
 s2 = SHIFT_OP(refptr[3], refptr[2]); \
 s3 = SHIFT_OP(refptr[4], refptr[3]); \
 tmp0 = UME8UU(curptr[0], s0); \
 tmp1 = UME8UU(curptr[1], s1); \
 tmp2 = UME8UU(curptr[2], s2); \
 tmp3 = UME8UU(curptr[3], s3); \
 sad += (tmp0 + tmp1) + (tmp2 + tmp3);

/* FUNSHIFT3 variant. */
#define SAD16_1s() \
 SAD16_SHIFTED_ROW(FUNSHIFT3)

#define SAD16_1() \
 SAD16_1s() \
 SAD16_NEXT_ROW()

/* FUNSHIFT2 variant. */
#define SAD16_2s() \
 SAD16_SHIFTED_ROW(FUNSHIFT2)

#define SAD16_2() \
 SAD16_2s() \
 SAD16_NEXT_ROW()

/* FUNSHIFT1 variant. */
#define SAD16_3s() \
 SAD16_SHIFTED_ROW(FUNSHIFT1)

#define SAD16_3() \
 SAD16_3s() \
 SAD16_NEXT_ROW()

/* Unshifted variant: reference words are used exactly as loaded. */
#define SAD16s() \
 tmp0 = UME8UU(curptr[0], refptr[0]); \
 tmp1 = UME8UU(curptr[1], refptr[1]); \
 tmp2 = UME8UU(curptr[2], refptr[2]); \
 tmp3 = UME8UU(curptr[3], refptr[3]); \
 sad += (tmp0 + tmp1) + (tmp2 + tmp3);

#define SAD16() \
 SAD16s() \
 SAD16_NEXT_ROW()
 
/*
 * Sum of absolute differences over a 16x16 byte block.
 *
 * cur, ref : top-left byte of the current and the reference block
 * stride   : distance in bytes between consecutive rows of both blocks
 * returns  : the accumulated SAD
 *
 * Scalar equivalent (the plain C version this routine stands in for):
 *   for each of the 16 rows
 *       sad += abs(cur[i] - ref[i])   for i = 0 .. 15
 *       cur += stride;  ref += stride;
 *
 * Both blocks are read through word pointers, four words per row, with one
 * UME8UU per word pair.  ref may sit at any byte offset: its address is
 * rounded down to a multiple of 4 and the dropped low two bits pick the
 * row kernel (SAD16 for 0, SAD16_1 / SAD16_2 / SAD16_3 for 1 / 2 / 3).
 * cur is used as passed, with no realignment.
 *
 * NOTE(review): presumably cur must be word aligned and stride a multiple
 * of 4 (it is converted with stride >> 2) - confirm against the callers.
 * NOTE(review): the FUNSHIFTn pairing in the shifted kernels looks like it
 * assumes little-endian byte order - confirm for the target configuration.
 */
static __inline int sad16_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride)
{
#pragma TCS_no_caller_save
 int s0, s1, s2, s3;          /* scratch written by the shifted kernels */
 int tmp0, tmp1, tmp2, tmp3;  /* scratch written by every kernel */
 int sad = 0;

 /* Row pitch in words, because the row pointers below step in words. */
 const int stride_scale_4 = stride>>2 ;

 const int ref_offset  = (int) ref & 3;   /* byte offset inside the word */
 const int ref_aligned = (int) ref & ~3;  /* address rounded down to a word */

 long *restrict curptr = (long*)cur;
 long *restrict refptr = (long*)ref_aligned;

 /* 16 rows per case; the last row uses the variant that does not advance. */
 switch (ref_offset)
 {
 case 0:
  SAD16(); SAD16(); SAD16(); SAD16();
  SAD16(); SAD16(); SAD16(); SAD16();
  SAD16(); SAD16(); SAD16(); SAD16();
  SAD16(); SAD16(); SAD16(); SAD16s();
  break;
 case 1:
  SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
  SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
  SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1();
  SAD16_1(); SAD16_1(); SAD16_1(); SAD16_1s();
  break;
 case 2:
  SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
  SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
  SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2();
  SAD16_2(); SAD16_2(); SAD16_2(); SAD16_2s();
  break;
 default:
  SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
  SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
  SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3();
  SAD16_3(); SAD16_3(); SAD16_3(); SAD16_3s();
  break;
 }

 return sad;
}

/*
 * Row kernels for the 8-byte-wide SAD routine.  Both expand to plain
 * statements and use curptr, refptr, stride_scale_4, sad, tmp0 and tmp1
 * from the expanding scope.
 */

/* One row (two words per block); the row pointers are left untouched. */
#define SAD8s() \
 tmp0 = refptr[0]; \
 tmp1 = refptr[1]; \
 sad += UME8UU(curptr[0], tmp0) + UME8UU(curptr[1], tmp1);

/* One row, then step both row pointers to the next row. */
#define SAD8() \
 SAD8s() \
 curptr += stride_scale_4; \
 refptr += stride_scale_4;

/*
 * Sum of absolute differences over an 8x8 byte block.
 *
 * cur, ref : top-left byte of the current and the reference block
 * stride   : distance in bytes between consecutive rows of both blocks
 * returns  : the accumulated SAD
 *
 * Scalar equivalent (the plain C version this routine stands in for):
 *   for each of the 8 rows
 *       sad += abs(cur[i] - ref[i])   for i = 0 .. 7
 *       cur += stride;  ref += stride;
 *
 * Each row is handled as two words per block with one UME8UU per word pair.
 *
 * NOTE(review): unlike sad16_f, ref is used as passed, with no rounding to
 * a word boundary - confirm that callers only pass word-aligned pointers
 * or that the target tolerates unaligned word loads.
 */
static __inline int sad8_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride)
{
#pragma TCS_no_caller_save
 int tmp0, tmp1;  /* scratch written by SAD8()/SAD8s() */
 int sad = 0;

 /* Row pitch in words, because the row pointers below step in words. */
 const int stride_scale_4 = stride>>2;

 long *restrict curptr = (long*)cur;
 long *restrict refptr = (long*)ref;

 /* Eight rows; the last one uses the variant that does not advance. */
 SAD8(); SAD8(); SAD8(); SAD8();
 SAD8(); SAD8(); SAD8(); SAD8s();

 return sad;
}

/*
 * Row kernels for the four-way split 16x16 SAD routine.  One expansion
 * handles one row through the curptr0/refptr0 pair and one row through the
 * curptr2/refptr2 pair, keeping a separate total per half row:
 *   sad0 - words 0..1 via curptr0/refptr0     sad1 - words 2..3 via the same pair
 *   sad2 - words 0..1 via curptr2/refptr2     sad3 - words 2..3 via the same pair
 * Both macros expand to plain statements and additionally use
 * stride_scale_4 and tmp0..tmp7 from the expanding scope.
 */

/* One row per pointer pair; all four pointers are left untouched. */
#define SAD16Vs() \
 tmp0 = UME8UU(curptr0[0], refptr0[0]); \
 tmp1 = UME8UU(curptr0[1], refptr0[1]); \
 sad0 += (tmp0 + tmp1); \
 tmp2 = UME8UU(curptr0[2], refptr0[2]); \
 tmp3 = UME8UU(curptr0[3], refptr0[3]); \
 sad1 += (tmp2 + tmp3); \
 tmp4 = UME8UU(curptr2[0], refptr2[0]); \
 tmp5 = UME8UU(curptr2[1], refptr2[1]); \
 sad2 += (tmp4 + tmp5); \
 tmp6 = UME8UU(curptr2[2], refptr2[2]); \
 tmp7 = UME8UU(curptr2[3], refptr2[3]); \
 sad3 += (tmp6 + tmp7);

/* One row per pointer pair, then step all four pointers to the next row. */
#define SAD16V() \
 SAD16Vs() \
 curptr0 += stride_scale_4; \
 refptr0 += stride_scale_4; \
 curptr2 += stride_scale_4; \
 refptr2 += stride_scale_4;
 
/*
 * 16x16 SAD that also reports the SAD of each 8x8 quadrant.
 *
 * cur, ref : top-left byte of the current and the reference block
 * stride   : distance in bytes between consecutive rows of both blocks
 * sad      : output array of four ints, filled as
 *              sad[0] rows 0..7,  bytes 0..7
 *              sad[1] rows 0..7,  bytes 8..15
 *              sad[2] rows 8..15, bytes 0..7
 *              sad[3] rows 8..15, bytes 8..15
 * returns  : sad[0] + sad[1] + sad[2] + sad[3]
 *
 * Equivalent to four sad8_f() calls on cur/ref, cur+8/ref+8,
 * cur+8*stride/ref+8*stride and cur+8*stride+8/ref+8*stride+8.  The upper
 * and the lower half of the block are walked together, eight rows each.
 * The byte ranges above assume a 4-byte long, as stride >> 2 implies.
 *
 * NOTE(review): ref is used as passed, with no rounding to a word
 * boundary - confirm the callers' alignment guarantees.
 */
static __inline int sad16v_f(const uint8_t * const cur, const uint8_t * const ref, const int32_t stride, int *sad)
{
#pragma TCS_no_caller_save
 int tmp0, tmp1, tmp2, tmp3;  /* scratch written by SAD16V()/SAD16Vs() */
 int tmp4, tmp5, tmp6, tmp7;

 /* One running total per quadrant. */
 int sad0 = 0;
 int sad1 = 0;
 int sad2 = 0;
 int sad3 = 0;

 /* Row pitch in words, because the row pointers below step in words. */
 const int stride_scale_4 = stride >> 2;

 /* Pair 0 starts at row 0, pair 2 starts eight rows further down. */
 long *restrict curptr0 = (long*)cur;
 long *restrict refptr0 = (long*)ref;
 long *restrict curptr2 = (long*)(cur + (stride << 3));
 long *restrict refptr2 = (long*)(ref + (stride << 3));

 /* Eight steps; the last one uses the variant that does not advance. */
 SAD16V(); SAD16V(); SAD16V(); SAD16V();
 SAD16V(); SAD16V(); SAD16V(); SAD16Vs();

 sad[0] = sad0;
 sad[1] = sad1;
 sad[2] = sad2;
 sad[3] = sad3;

 return sad0+sad1+sad2+sad3;
}

/*
 * Row kernels for the 16x16 deviation routine.  All of them expand to plain
 * statements and use names from the expanding scope: curptr,
 * stride_scale_4, tmp0..tmp3, plus sum (SUM16*), dev and mean (DEV16*),
 * and mean0, mean1, mean2 (MEAN16).
 */

/* Add one row of four words to sum by comparing each word with ZERO. */
#define SUM16s() \
 tmp0 = UME8UU(curptr[0], ZERO); \
 tmp1 = UME8UU(curptr[1], ZERO); \
 tmp2 = UME8UU(curptr[2], ZERO); \
 tmp3 = UME8UU(curptr[3], ZERO); \
 sum += (tmp0 + tmp1) + (tmp2 + tmp3);

/* Same as SUM16s(), then step curptr to the next row. */
#define SUM16() \
 SUM16s() \
 curptr += stride_scale_4;

/*
 * Shift sum right by 8 (a division by 256 for the non-negative totals
 * involved) and pack the result with PACKBYTES / PACK16LSB into the word
 * that DEV16()/DEV16s() compare every source word against.
 */
#define MEAN16() \
 mean0 = ASRI(8, sum); \
 mean1 = PACKBYTES(mean0, mean0); \
 mean2 = PACKBYTES(mean0, mean0); \
 mean = PACK16LSB(mean1, mean2);

/* Add one row's UME8UU results against mean to dev. */
#define DEV16s() \
 tmp0 = UME8UU(curptr[0], mean); \
 tmp1 = UME8UU(curptr[1], mean); \
 tmp2 = UME8UU(curptr[2], mean); \
 tmp3 = UME8UU(curptr[3], mean); \
 dev += (tmp0 + tmp1) + (tmp2 + tmp3);

/* Same as DEV16s(), then step curptr to the next row. */
#define DEV16() \
 DEV16s() \
 curptr += stride_scale_4;

/*
 * Sum of absolute deviations from the mean over a 16x16 byte block.
 *
 * cur     : top-left byte of the block
 * stride  : distance in bytes between consecutive rows
 * returns : the accumulated deviation
 *
 * Scalar equivalent (the plain C version this routine stands in for):
 *   mean = (sum of all 256 bytes) / (16 * 16)
 *   dev  = sum of abs(byte - mean) over all 256 bytes
 *
 * Two passes over the same 16 rows: the first accumulates the byte sum,
 * MEAN16() turns it into the comparison word, and the second pass, after
 * rewinding curptr, accumulates the deviation.
 *
 * NOTE(review): cur is read through a word pointer and stride is converted
 * with stride >> 2, so presumably cur must be word aligned and stride a
 * multiple of 4 - confirm against the callers.
 */
static __inline int dev16_f(const uint8_t* const cur, const int32_t stride)
{
#pragma TCS_no_caller_save
 int tmp0, tmp1, tmp2, tmp3;     /* scratch written by the row kernels */
 int sum = 0;
 int dev = 0;
 int mean, mean0, mean1, mean2;  /* written by MEAN16() */

 /* Row pitch in words, because curptr steps in words. */
 const int stride_scale_4 = stride>>2;

 long *restrict curptr = (long*)cur;

 /* Pass 1: byte sum of all 16 rows (last row does not advance). */
 SUM16(); SUM16(); SUM16(); SUM16();
 SUM16(); SUM16(); SUM16(); SUM16();
 SUM16(); SUM16(); SUM16(); SUM16();
 SUM16(); SUM16(); SUM16(); SUM16s();

 MEAN16();

 /* Pass 2: start over from the first row. */
 curptr = (long*)cur;

 DEV16(); DEV16(); DEV16(); DEV16();
 DEV16(); DEV16(); DEV16(); DEV16();
 DEV16(); DEV16(); DEV16(); DEV16();
 DEV16(); DEV16(); DEV16(); DEV16s();

 return dev;
}

#endif