
TM1500 memory transfer optimization routines

Posted on 2009-10-24 09:05 by mcodec

 

TM1500 memory transfer optimization routines, used in an MPEG-4 video encoder/decoder. The header below implements 8x8 and 16x16 block copy, 8-bit/16-bit conversion, and saturating add/subtract primitives on 32-bit words, built from TriMedia custom operations (MERGELSB/MERGEMSB, MERGEDUAL16LSB, DUALUCLIPI, DSPIDUALADD/DSPIDUALSUB, FUNSHIFT1/2/3).
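All routines move whole 32-bit words, so the pixel buffers and the stride are expected to be 4-byte aligned; only the reference pointer in sub8to16_8x8 may sit at an arbitrary byte address. The sketch below is not part of the original post: it is only a minimal illustration of how the block-transfer functions would typically be called from the surrounding codec, assuming the header is saved as mem_transfer.h; the helper names reconstruct_inter_8x8 and make_residual_8x8 are illustrative.

#include <stdint.h>
#include "mem_transfer.h"  /* assumed file name for the header below */

/* Decoder side: rebuild one 8x8 inter block.
   cur      - top-left pixel of the block in the current frame
   pred     - motion-compensated prediction (same stride)
   residual - 64 dequantized, inverse-transformed coefficients
   stride   - frame row pitch in bytes (multiple of 4)            */
static void reconstruct_inter_8x8(uint8_t *cur, const uint8_t *pred,
                                  short *residual, int32_t stride)
{
 copy8to8_8x8(cur, pred, stride);      /* copy the prediction           */
 add16to8_8x8(cur, residual, stride);  /* add residual, clamp to 0..255 */
}

/* Encoder side: residual = cur - ref for one 8x8 block; cur is
   overwritten with the prediction so the block can later be
   reconstructed with add16to8_8x8.                               */
static void make_residual_8x8(short *residual, uint8_t *cur,
                              const uint8_t *ref, int32_t stride)
{
 sub8to16_8x8(residual, cur, ref, stride);
}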


#ifndef _MEM_TRANSFER_H
#define _MEM_TRANSFER_H

/* uint8_t / int16_t / int32_t; if <stdint.h> is not available on the
   TCS toolchain, substitute the project's own typedef header. */
#include <stdint.h>

#define SCALETWOBIT 2
#define SCALEONEBIT 1

#define ZERO 0

#define MERGE_LSB(a, b)   MERGELSB(a, b)
#define MERGE_MSB(a, b)   MERGEMSB(a, b)
#define MERGEDUAL16_LSB(a, b) MERGEDUAL16LSB(b, a)


#define COPY16TO16()  \
 (dstptr)[0] = (srcptr)[0]; \
 (dstptr)[1] = (srcptr)[1]; \
 (dstptr)[2] = (srcptr)[2]; \
 (dstptr)[3] = (srcptr)[3]; \
 dstptr += stride_scale_4; \
 srcptr += stride_scale_4;

#define COPY16TO16s()  \
 (dstptr)[0] = (srcptr)[0]; \
 (dstptr)[1] = (srcptr)[1]; \
 (dstptr)[2] = (srcptr)[2]; \
 (dstptr)[3] = (srcptr)[3];
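
/*
 * copy8to8_16x16: copy a 16x16 block of 8-bit pixels from src to dst.
 * stride is the row pitch in bytes; whole 32-bit words are moved, so
 * dst, src and stride are expected to be 4-byte aligned.
 */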
 
static __inline void copy8to8_16x16(uint8_t * const dst, const uint8_t * const src, const int32_t stride)
{
#pragma TCS_no_caller_save

 const int stride_scale_4 = stride>>2;

 long *restrict dstptr;
 long *restrict srcptr; 
 dstptr = (long*)dst;
 srcptr = (long*)src;
 
 COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16(); 
 COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16(); 
 COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16(); 
 COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16s();
/*
 copy8to8_8x8(dst, src, stride);
 copy8to8_8x8(dst + 8, src + 8, stride);
 copy8to8_8x8(dst + 8*stride, src + 8*stride, stride);
 copy8to8_8x8(dst + 8*stride + 8, src + 8*stride + 8, stride);
// */
}

#define COPY8TO8()  \
 (dstptr)[0] = (srcptr)[0]; \
 (dstptr)[1] = (srcptr)[1]; \
 dstptr += stride_scale_4; \
 srcptr += stride_scale_4;

#define COPY8TO8s()  \
 (dstptr)[0] = (srcptr)[0]; \
 (dstptr)[1] = (srcptr)[1];
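
/*
 * copy8to8_8x8: copy an 8x8 block of 8-bit pixels from src to dst
 * (two 32-bit words per row); stride is the row pitch in bytes.
 */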

static __inline void copy8to8_8x8(uint8_t * const dst, const uint8_t * const src, const int32_t stride)
{
#pragma TCS_no_caller_save

 const int stride_scale_4 = stride>>2;

 long *restrict dstptr;
 long *restrict srcptr; 
 dstptr = (long*)dst;
 srcptr = (long*)src;

 COPY8TO8(); COPY8TO8(); COPY8TO8(); COPY8TO8(); 
 COPY8TO8(); COPY8TO8(); COPY8TO8(); COPY8TO8s();
/*
 int32_t j;

 for (j = 0; j < 8; j++)
 {
  uint32_t *d= (uint32_t*)(dst + j*stride);
  const uint32_t *s = (const uint32_t*)(src + j*stride);
  *(d+0) = *(s+0);
  *(d+1) = *(s+1);
 }
// */
}

#define COPY16TO8() \
 tmp0 = DUALUCLIPI((srcptr)[0], 255); \
 tmp1 = DUALUCLIPI((srcptr)[1], 255); \
 tmp2 = DUALUCLIPI((srcptr)[2], 255); \
 tmp3 = DUALUCLIPI((srcptr)[3], 255); \
 (dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
 (dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3); \
 dstptr += stride_scale_4; \
 srcptr += 4;

#define COPY16TO8s() \
 tmp0 = DUALUCLIPI((srcptr)[0], 255); \
 tmp1 = DUALUCLIPI((srcptr)[1], 255); \
 tmp2 = DUALUCLIPI((srcptr)[2], 255); \
 tmp3 = DUALUCLIPI((srcptr)[3], 255); \
 (dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
 (dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3);
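
/*
 * copy16to8_8x8: clip a contiguous 8x8 block of 16-bit values (src) to
 * [0, 255] and store it as 8-bit pixels at dst with row pitch stride;
 * typically used to write an intra block back after the inverse DCT.
 */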

static __inline void copy16to8_8x8(uint8_t* const dst, const short * const src, int32_t stride)
{
#pragma TCS_no_caller_save 
 int tmp0, tmp1, tmp2, tmp3;

 const int stride_scale_4 = stride>>2;

 long *restrict dstptr;
 long *restrict srcptr; 
 dstptr = (long*)dst;
 srcptr = (long*)src;

 COPY16TO8(); COPY16TO8(); COPY16TO8(); COPY16TO8(); 
 COPY16TO8(); COPY16TO8(); COPY16TO8(); COPY16TO8s();
/*
 int32_t i, j;

 for (j = 0; j < 8; j++)
 {
  for (i = 0; i < 8; i++)
  {
   int16_t pixel = src[j * 8 + i];

   if (pixel < 0)
   {
    pixel = 0;
   }
   else if (pixel > 255)
   {
    pixel = 255;
   }
   dst[j * stride + i] = (uint8_t) pixel;
  }
 }
// */ 
}

#define ADD16TO8() \
 tmp0 = MERGE_LSB(ZERO,(dstptr)[0]); \
 tmp1 = MERGE_MSB(ZERO,(dstptr)[0]); \
 i0 = DSPIDUALADD( tmp0, (srcptr)[0]); \
 i1 = DSPIDUALADD( tmp1, (srcptr)[1]); \
 tmp0 = DUALUCLIPI( i0, 255 ); \
 tmp1 = DUALUCLIPI( i1, 255 ); \
 tmp2 = MERGE_LSB(ZERO,(dstptr)[1]); \
 tmp3 = MERGE_MSB(ZERO,(dstptr)[1]); \
 i0   = DSPIDUALADD( tmp2, (srcptr)[2]); \
 i1   = DSPIDUALADD( tmp3, (srcptr)[3]); \
 tmp2 = DUALUCLIPI( i0, 255 ); \
 tmp3 = DUALUCLIPI( i1, 255 ); \
 (dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
 (dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3); \
 dstptr += stride_scale_4; \
 srcptr += 4;

#define ADD16TO8s() \
 tmp0 = MERGE_LSB(ZERO,(dstptr)[0]); \
 tmp1 = MERGE_MSB(ZERO,(dstptr)[0]); \
 i0 = DSPIDUALADD( tmp0, (srcptr)[0]); \
 i1 = DSPIDUALADD( tmp1, (srcptr)[1]); \
 tmp0 = DUALUCLIPI( i0, 255 ); \
 tmp1 = DUALUCLIPI( i1, 255 ); \
 tmp2 = MERGE_LSB(ZERO,(dstptr)[1]); \
 tmp3 = MERGE_MSB(ZERO,(dstptr)[1]); \
 i0   = DSPIDUALADD( tmp2, (srcptr)[2]); \
 i1   = DSPIDUALADD( tmp3, (srcptr)[3]); \
 tmp2 = DUALUCLIPI( i0, 255 ); \
 tmp3 = DUALUCLIPI( i1, 255 ); \
 (dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
 (dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3);
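
/*
 * add16to8_8x8: add a contiguous 8x8 block of 16-bit values (src) to the
 * 8-bit pixels at dst, saturating each result to [0, 255]; this is the
 * inter-block reconstruction step (prediction + residual).
 */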

static __inline void add16to8_8x8(uint8_t * const dst, const short * const src, int32_t stride)
{
#pragma TCS_no_caller_save
 int tmp0, tmp1, tmp2, tmp3;
 int i0, i1;

 const int stride_scale_4 = stride>>2;

 long *restrict dstptr;
 long *restrict srcptr; 
 dstptr = (long*)dst;
 srcptr = (long*)src;

 ADD16TO8(); ADD16TO8(); ADD16TO8(); ADD16TO8(); 
 ADD16TO8(); ADD16TO8(); ADD16TO8(); ADD16TO8s();
/*
 int32_t i, j;

 for (j = 0; j < 8; j++)
 {
  for (i = 0; i < 8; i++)
  {
   int16_t pixel = (int16_t) dst[j * stride + i] + src[j * 8 + i];

   if (pixel < 0)
   {
    pixel = 0;
   }
   else if (pixel > 255)
   {
    pixel = 255;
   }
   dst[j * stride + i] = (uint8_t) pixel;
  }
 }
// */
}

#define COPY8TO16() \
 tmp0 = (srcptr)[0]; \
 tmp1 = (srcptr)[1]; \
 (dstptr)[0] = MERGE_LSB(ZERO,tmp0); \
 (dstptr)[1] = MERGE_MSB(ZERO,tmp0); \
 (dstptr)[2] = MERGE_LSB(ZERO,tmp1); \
 (dstptr)[3] = MERGE_MSB(ZERO,tmp1); \
 dstptr += 4; \
 srcptr += stride_scale_4;

#define COPY8TO16s() \
 tmp0 = (srcptr)[0]; \
 tmp1 = (srcptr)[1]; \
 (dstptr)[0] = MERGE_LSB(ZERO,tmp0); \
 (dstptr)[1] = MERGE_MSB(ZERO,tmp0); \
 (dstptr)[2] = MERGE_LSB(ZERO,tmp1); \
 (dstptr)[3] = MERGE_MSB(ZERO,tmp1); 
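
/*
 * copy8to16_8x8: zero-extend an 8x8 block of 8-bit pixels (src, row pitch
 * stride) into a contiguous 8x8 block of 16-bit values at dst.
 */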
 
static __inline void copy8to16_8x8(short * const dst, const uint8_t* const src, int32_t stride)
{
#pragma TCS_no_caller_save
 int tmp0, tmp1;

 const int stride_scale_4 = stride>>2;

 long *restrict dstptr;
 long *restrict srcptr; 
 dstptr = (long*)dst;
 srcptr = (long*)src;

 COPY8TO16(); COPY8TO16(); COPY8TO16(); COPY8TO16();
 COPY8TO16(); COPY8TO16(); COPY8TO16(); COPY8TO16s(); 
/*
 int32_t i, j;

 for (j = 0; j < 8; j++)
 {
  for (i = 0; i < 8; i++)
   dst[j * 8 + i] = (int16_t) src[j * stride + i];
 }
// */
}

#define SUB8TO16_1()  \
 tmp2 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
 curptr += stride_scale_4; \
 refptr += stride_scale_4; \
 dstptr += 4;

#define SUB8TO16_1s()  \
 tmp2 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));

#define SUB8TO16_2()  \
 tmp2 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
 curptr += stride_scale_4; \
 refptr += stride_scale_4; \
 dstptr += 4;

#define SUB8TO16_2s()  \
 tmp2 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));

#define SUB8TO16_3()  \
 tmp2 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
 curptr += stride_scale_4; \
 refptr += stride_scale_4; \
 dstptr += 4;
   
#define SUB8TO16_3s()  \
 tmp2 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
 tmp3 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));

#define SUB8TO16()  \
 tmp2 = (refptr)[0]; \
 tmp3 = (refptr)[1]; \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
 curptr += stride_scale_4; \
 refptr += stride_scale_4; \
 dstptr += 4;

#define SUB8TO16s()  \
 tmp2 = (refptr)[0]; \
 tmp3 = (refptr)[1]; \
 tmp0 = (curptr)[0]; \
 tmp1 = (curptr)[1]; \
 (curptr)[0] = tmp2; \
 (curptr)[1] = tmp3; \
 (dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
 (dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
 (dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
 (dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));
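
/*
 * sub8to16_8x8: compute dst = cur - ref as a contiguous 8x8 block of
 * 16-bit differences, and copy ref into cur at the same time (cur then
 * holds the prediction for later reconstruction). ref may be unaligned:
 * it is rounded down to a 4-byte boundary and FUNSHIFT1/2/3 reassemble
 * the misaligned words from aligned loads, selected by the byte offset.
 */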

static __inline void sub8to16_8x8(short * const dst, uint8_t * const cur, const uint8_t* ref, const int32_t stride)
{
#pragma TCS_no_caller_save 

 int tmp0, tmp1, tmp2, tmp3;

 const int stride_scale_4 = stride>>2;

 int ref1 = (int) ref & ~3; 
 int ref2 = (int) ref & 3;

 long *restrict dstptr;
 long *restrict curptr;
 long *restrict refptr; 
 dstptr = (long*)dst;
 curptr = (long*)cur;
 refptr = (long*)ref1;
 
 switch (ref2)
 {
 case 1:
  SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1();  
  SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1s();
  return;
 case 2:
  SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2();  
  SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2s();
  return;
 case 3:
  SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3();  
  SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3s();
  return;
 default:
  SUB8TO16(); SUB8TO16(); SUB8TO16(); SUB8TO16();
  SUB8TO16(); SUB8TO16(); SUB8TO16(); SUB8TO16s();
  return;
 }
/*
 int32_t i, j;

 for (j = 0; j < 8; j++)
 {
  for (i = 0; i < 8; i++)
  {
   uint8_t c = cur[j * stride + i];
   uint8_t r = ref[j * stride + i];

   cur[j * stride + i] = r;
   dst[j * 8 + i] = (int16_t) c - (int16_t) r;
  }
 }
// */
}
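
/*
 * sub8to16_16x16: apply sub8to16_8x8 to the four 8x8 sub-blocks of a
 * 16x16 macroblock; dst receives four contiguous 64-entry blocks.
 */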

static __inline void sub8to16_16x16(int16_t * dst, uint8_t * const cur, const uint8_t * ref, const int32_t stride)
{
 int stride8 = stride * 8;

 sub8to16_8x8(dst, cur, ref, stride);
 sub8to16_8x8(dst+64, cur+8, ref+8, stride);
 sub8to16_8x8(dst+128, cur+stride8, ref+stride8, stride);
 sub8to16_8x8(dst+192, cur+stride8+8, ref+stride8+8, stride);
}

#endif