MCODEC - 感恩的心,感谢生命中的每一个人

(本站所有文章都是原创,转载请注明出处)
  首页  :: 联系 :: 订阅 订阅  :: 管理

tm1500 半像素插值优化算法

Posted on 2009-10-24 09:02  mcodec  阅读(321)  评论(0)  编辑  收藏  举报

 

本文给出应用于 Philips TriMedia 1500 系列 SoC 的 MPEG-4 视频编解码器中的半像素插值快速算法(8x8 块的水平、垂直、水平加垂直三种情形)。


#ifndef _INTERPOLATE8X8_H_
#define _INTERPOLATE8X8_H_

/*
 * HALFPEL_H_RND0 - one row of horizontal half-pel interpolation, then step
 * to the next row.
 *
 * Packs src1[0..7] into the words s0/s1 and the right-hand neighbours
 * src1[1..8] into q0/q1, stores QUADAVG() of each pair to dstptr[0] and
 * dstptr[1], then advances src1 by stride and dstptr by stride_scale_4.
 * halfpel_h() uses it on its rounding == 0 path.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG are defined outside this
 * file (presumably TriMedia custom ops); QUADAVG is assumed to be a per-byte
 * average that rounds up -- confirm against the target headers.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays correct under an unbraced if/else.  Invoke it as a
 * statement, with a trailing semicolon.
 */
#define HALFPEL_H_RND0() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)

/*
 * HALFPEL_H_RND0s - one row of horizontal half-pel interpolation without the
 * pointer step ("s" = stop; halfpel_h() uses it for the last of its 8 rows on
 * the rounding == 0 path).
 *
 * Packs src1[0..7] into s0/s1 and src1[1..8] into q0/q1, then stores
 * QUADAVG() of each pair to dstptr[0] and dstptr[1].  src1 and dstptr are
 * left unchanged.
 *
 * Expansion scope must provide: src1, dstptr and the scratch variables
 * r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG are defined outside this
 * file (presumably TriMedia custom ops); QUADAVG is assumed to be a per-byte
 * average that rounds up -- confirm against the target headers.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_H_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
    } while (0)

/*
 * HALFPEL_H_RND1 - one row of horizontal half-pel interpolation for the
 * rounding != 0 path of halfpel_h(), then step to the next row.
 *
 * Packs src1[0..7] into s0/s1 and src1[1..8] into q0/q1, stores QUADAVG0()
 * of each pair to dstptr[0] and dstptr[1], then advances src1 by stride and
 * dstptr by stride_scale_4.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG0 are defined outside this
 * file; QUADAVG0 is assumed to be the truncating (no +1) per-byte average --
 * confirm against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_H_RND1() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)

/*
 * HALFPEL_H_RND1s - one row of horizontal half-pel interpolation for the
 * rounding != 0 path of halfpel_h(), without the pointer step (used for the
 * last of the 8 rows).
 *
 * Packs src1[0..7] into s0/s1 and src1[1..8] into q0/q1, then stores
 * QUADAVG0() of each pair to dstptr[0] and dstptr[1].  src1 and dstptr are
 * left unchanged.
 *
 * Expansion scope must provide: src1, dstptr and the scratch variables
 * r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG0 are defined outside this
 * file; QUADAVG0 is assumed to be the truncating (no +1) per-byte average --
 * confirm against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_H_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
    } while (0)

/*
 * halfpel_h - horizontal half-pel interpolation of one 8x8 block.
 *
 * Processes 8 rows, fully unrolled through the HALFPEL_H_* macros.  Each row
 * reads src[0..8] and writes 8 destination bytes as two 32-bit stores.
 *
 * dst:      destination block; written through a 32-bit word pointer, so it
 *           must be suitably aligned for word stores.
 * src:      source block; 9 bytes per row are read.
 * stride:   distance between rows in bytes, used for both src and dst.  The
 *           destination steps by stride >> 2 words, so stride must be a
 *           multiple of 4.
 * rounding: non-zero selects the HALFPEL_H_RND1 macros (QUADAVG0), zero
 *           selects the HALFPEL_H_RND0 macros (QUADAVG).
 *
 * Scalar reference kept from the original source (the intrinsic path is
 * intended to produce the same bytes):
 *
 *     int32_t j;
 *     int32_t r = 1 - rounding;
 *     for (j = 0; j < 8*stride; j += stride)
 *         for x in 0..7:
 *             dst[j + x] = (uint8_t)((src[j + x] + src[j + x + 1] + r) >> 1);
 *
 * NOTE(review): the byte order of the packed word stores depends on
 * PACKBYTES/PACK16LSB and on the target's endianness -- confirm on target.
 */
static __inline void halfpel_h(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch words named directly by the HALFPEL_H_* macros. */
    int r0, r1, r2, r3;
    int t0, t1, t2, t3;
    int s0, s1, q0, q1;

    /* Destination row step expressed in 32-bit words. */
    const int stride_scale_4 = stride >> 2;

    /*
     * int32_t rather than long: the two stores per row and the
     * stride >> 2 step are only correct for 4-byte words, and the width
     * of long is not guaranteed.
     */
    int32_t *restrict dstptr = (int32_t *)dst;
    const unsigned char *restrict src1 = src;

    if (rounding)
    {
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();

        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        /* Last row: no pointer step needed. */
        HALFPEL_H_RND1s();
    }
    else
    {
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();

        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        /* Last row: no pointer step needed. */
        HALFPEL_H_RND0s();
    }
}

/*
 * HALFPEL_V_RND0 - one row of vertical half-pel interpolation, then step to
 * the next row.
 *
 * Packs src1[0..7] (current row) into s0/s1 and src1[stride + 0..7] (row
 * below) into q0/q1, stores QUADAVG() of each pair to dstptr[0] and
 * dstptr[1], then advances dstptr by stride_scale_4 and src1 by stride.
 * halfpel_v() uses it on its rounding == 0 path.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG are defined outside this
 * file (presumably TriMedia custom ops); QUADAVG is assumed to be a per-byte
 * average that rounds up -- confirm against the target headers.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_V_RND0() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
        dstptr += stride_scale_4; \
        src1 += stride; \
    } while (0)

/*
 * HALFPEL_V_RND0s - one row of vertical half-pel interpolation without the
 * pointer step (halfpel_v() uses it for the last of its 8 rows on the
 * rounding == 0 path).
 *
 * Packs src1[0..7] (current row) into s0/s1 and src1[stride + 0..7] (row
 * below) into q0/q1, then stores QUADAVG() of each pair to dstptr[0] and
 * dstptr[1].  src1 and dstptr are left unchanged.
 *
 * Expansion scope must provide: src1, dstptr, stride and the scratch
 * variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG are defined outside this
 * file (presumably TriMedia custom ops); QUADAVG is assumed to be a per-byte
 * average that rounds up -- confirm against the target headers.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_V_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
    } while (0)

/*
 * HALFPEL_V_RND1 - one row of vertical half-pel interpolation for the
 * rounding != 0 path of halfpel_v(), then step to the next row.
 *
 * Packs src1[0..7] (current row) into s0/s1 and src1[stride + 0..7] (row
 * below) into q0/q1, stores QUADAVG0() of each pair to dstptr[0] and
 * dstptr[1], then advances dstptr by stride_scale_4 and src1 by stride.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG0 are defined outside this
 * file; QUADAVG0 is assumed to be the truncating (no +1) per-byte average --
 * confirm against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_V_RND1() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
        dstptr += stride_scale_4; \
        src1 += stride; \
    } while (0)

/*
 * HALFPEL_V_RND1s - one row of vertical half-pel interpolation for the
 * rounding != 0 path of halfpel_v(), without the pointer step (used for the
 * last of the 8 rows).
 *
 * Packs src1[0..7] (current row) into s0/s1 and src1[stride + 0..7] (row
 * below) into q0/q1, then stores QUADAVG0() of each pair to dstptr[0] and
 * dstptr[1].  src1 and dstptr are left unchanged.
 *
 * Expansion scope must provide: src1, dstptr, stride and the scratch
 * variables r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * NOTE(review): PACKBYTES, PACK16LSB and QUADAVG0 are defined outside this
 * file; QUADAVG0 is assumed to be the truncating (no +1) per-byte average --
 * confirm against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_V_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
    } while (0)

/*
 * halfpel_v - vertical half-pel interpolation of one 8x8 block.
 *
 * Processes 8 rows, fully unrolled through the HALFPEL_V_* macros.  Each row
 * reads src[0..7] from the current row and from the row below (offset
 * stride), and writes 8 destination bytes as two 32-bit stores.  In total 9
 * source rows are read.
 *
 * dst:      destination block; written through a 32-bit word pointer, so it
 *           must be suitably aligned for word stores.
 * src:      source block; 8 bytes from each of 9 rows are read.
 * stride:   distance between rows in bytes, used for both src and dst.  The
 *           destination steps by stride >> 2 words, so stride must be a
 *           multiple of 4.
 * rounding: non-zero selects the HALFPEL_V_RND1 macros (QUADAVG0), zero
 *           selects the HALFPEL_V_RND0 macros (QUADAVG).
 *
 * Scalar reference kept from the original source (the intrinsic path is
 * intended to produce the same bytes):
 *
 *     int32_t j;
 *     int32_t r = 1 - rounding;
 *     for (j = 0; j < 8*stride; j += stride)
 *         for x in 0..7:
 *             dst[j + x] = (uint8_t)((src[j + x] + src[j + stride + x] + r) >> 1);
 *
 * NOTE(review): the byte order of the packed word stores depends on
 * PACKBYTES/PACK16LSB and on the target's endianness -- confirm on target.
 */
static __inline void halfpel_v(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch words named directly by the HALFPEL_V_* macros. */
    int r0, r1, r2, r3;
    int t0, t1, t2, t3;
    int s0, s1, q0, q1;

    /* Destination row step expressed in 32-bit words. */
    const int stride_scale_4 = stride >> 2;

    /*
     * int32_t rather than long: the two stores per row and the
     * stride >> 2 step are only correct for 4-byte words, and the width
     * of long is not guaranteed.
     */
    int32_t *restrict dstptr = (int32_t *)dst;
    const unsigned char *restrict src1 = src;

    if (rounding)
    {
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();

        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        /* Last row: no pointer step needed. */
        HALFPEL_V_RND1s();
    }
    else
    {
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();

        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        /* Last row: no pointer step needed. */
        HALFPEL_V_RND0s();
    }
}

/*
 * HALFPEL_HV_RND0 - one row of combined horizontal+vertical half-pel
 * interpolation, then step to the next row.
 *
 * For each pair of output pixels the macro packs two overlapping source
 * pairs from the current row (r-words, src1[x], src1[x+1], src1[x+2]) and
 * from the row below (t-words, same columns at offset stride), combines them
 * with BILINEAR2() into m0..m3, and merges m1/m0 into dstptr[0] and m3/m2
 * into dstptr[1] with MERGEDUAL16LSB().  It reads src1[0..8] and
 * src1[stride + 0..8], then advances src1 by stride and dstptr by
 * stride_scale_4.  halfpel_hv() uses it on its rounding == 0 path.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r7, t0-t7, m0-m3.
 *
 * NOTE(review): PACKBYTES, PACK16LSB, BILINEAR2 and MERGEDUAL16LSB are
 * defined outside this file; BILINEAR2 is assumed to produce, in each 16-bit
 * half, (a + b + c + d + 2) >> 2 of the four contributing bytes -- confirm
 * against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_HV_RND0() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR2(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR2(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR2(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR2(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)

/*
 * HALFPEL_HV_RND0s - one row of combined horizontal+vertical half-pel
 * interpolation without the pointer step (halfpel_hv() uses it for the last
 * of its 8 rows on the rounding == 0 path).
 *
 * Packs overlapping pixel pairs from the current row (r-words) and from the
 * row below (t-words), combines them with BILINEAR2() into m0..m3, and merges
 * m1/m0 into dstptr[0] and m3/m2 into dstptr[1] with MERGEDUAL16LSB().  It
 * reads src1[0..8] and src1[stride + 0..8]; src1 and dstptr are left
 * unchanged.
 *
 * Expansion scope must provide: src1, dstptr, stride and the scratch
 * variables r0-r7, t0-t7, m0-m3.
 *
 * NOTE(review): PACKBYTES, PACK16LSB, BILINEAR2 and MERGEDUAL16LSB are
 * defined outside this file; BILINEAR2 is assumed to produce, in each 16-bit
 * half, (a + b + c + d + 2) >> 2 of the four contributing bytes -- confirm
 * against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_HV_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR2(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR2(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR2(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR2(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
    } while (0)

/*
 * HALFPEL_HV_RND1 - one row of combined horizontal+vertical half-pel
 * interpolation for the rounding != 0 path of halfpel_hv(), then step to the
 * next row.
 *
 * Packs overlapping pixel pairs from the current row (r-words) and from the
 * row below (t-words), combines them with BILINEAR1() into m0..m3, and merges
 * m1/m0 into dstptr[0] and m3/m2 into dstptr[1] with MERGEDUAL16LSB().  It
 * reads src1[0..8] and src1[stride + 0..8], then advances src1 by stride and
 * dstptr by stride_scale_4.
 *
 * Expansion scope must provide: src1, dstptr, stride, stride_scale_4 and the
 * scratch variables r0-r7, t0-t7, m0-m3.
 *
 * NOTE(review): PACKBYTES, PACK16LSB, BILINEAR1 and MERGEDUAL16LSB are
 * defined outside this file; BILINEAR1 is assumed to produce, in each 16-bit
 * half, (a + b + c + d + 1) >> 2 of the four contributing bytes -- confirm
 * against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_HV_RND1() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR1(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR1(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR1(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR1(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)

/*
 * HALFPEL_HV_RND1s - one row of combined horizontal+vertical half-pel
 * interpolation for the rounding != 0 path of halfpel_hv(), without the
 * pointer step (used for the last of the 8 rows).
 *
 * Packs overlapping pixel pairs from the current row (r-words) and from the
 * row below (t-words), combines them with BILINEAR1() into m0..m3, and merges
 * m1/m0 into dstptr[0] and m3/m2 into dstptr[1] with MERGEDUAL16LSB().  It
 * reads src1[0..8] and src1[stride + 0..8]; src1 and dstptr are left
 * unchanged.
 *
 * Expansion scope must provide: src1, dstptr, stride and the scratch
 * variables r0-r7, t0-t7, m0-m3.
 *
 * NOTE(review): PACKBYTES, PACK16LSB, BILINEAR1 and MERGEDUAL16LSB are
 * defined outside this file; BILINEAR1 is assumed to produce, in each 16-bit
 * half, (a + b + c + d + 1) >> 2 of the four contributing bytes -- confirm
 * against its definition.
 *
 * Wrapped in do { } while (0) so it expands to a single statement; invoke it
 * with a trailing semicolon.
 */
#define HALFPEL_HV_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR1(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR1(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR1(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR1(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
    } while (0)

/*
 * halfpel_hv - combined horizontal+vertical half-pel interpolation of one
 * 8x8 block.
 *
 * Processes 8 rows, fully unrolled through the HALFPEL_HV_* macros.  Each row
 * reads src[0..8] from the current row and from the row below (offset
 * stride), and writes 8 destination bytes as two 32-bit stores.  In total a
 * 9x9 source area is read.
 *
 * dst:      destination block; written through a 32-bit word pointer, so it
 *           must be suitably aligned for word stores.
 * src:      source block; 9 bytes from each of 9 rows are read.
 * stride:   distance between rows in bytes, used for both src and dst.  The
 *           destination steps by stride >> 2 words, so stride must be a
 *           multiple of 4.
 * rounding: non-zero selects the HALFPEL_HV_RND1 macros (BILINEAR1), zero
 *           selects the HALFPEL_HV_RND0 macros (BILINEAR2).
 *
 * Scalar reference kept from the original source (the intrinsic path is
 * intended to produce the same bytes):
 *
 *     int32_t j;
 *     int32_t r = 2 - rounding;
 *     for (j = 0; j < 8*stride; j += stride)
 *         for x in 0..7:
 *             dst[j + x] = (uint8_t)((src[j + x] + src[j + x + 1] +
 *                                     src[j + stride + x] +
 *                                     src[j + stride + x + 1] + r) >> 2);
 *
 * NOTE(review): the byte order of the packed word stores depends on
 * PACKBYTES/PACK16LSB/MERGEDUAL16LSB and on the target's endianness --
 * confirm on target.
 */
static __inline void halfpel_hv(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch words named directly by the HALFPEL_HV_* macros. */
    int r0, r1, r2, r3, r4, r5, r6, r7;
    int t0, t1, t2, t3, t4, t5, t6, t7;
    int m0, m1, m2, m3;

    /* Destination row step expressed in 32-bit words. */
    const int stride_scale_4 = stride >> 2;

    /*
     * int32_t rather than long: the two stores per row and the
     * stride >> 2 step are only correct for 4-byte words, and the width
     * of long is not guaranteed.
     */
    int32_t *restrict dstptr = (int32_t *)dst;
    const unsigned char *restrict src1 = src;

    if (rounding)
    {
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();

        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        /* Last row: no pointer step needed. */
        HALFPEL_HV_RND1s();
    }
    else
    {
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();

        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        /* Last row: no pointer step needed. */
        HALFPEL_HV_RND0s();
    }
}

#endif