/*
 * TM1500 (TriMedia) memory-transfer primitives, optimized for the block
 * copy/convert operations of an MPEG-4 video encoder/decoder.
 */
#ifndef _MEM_TRANSFER_H
#define _MEM_TRANSFER_H

#include <stdint.h> /* uint8_t, int16_t, int32_t, uintptr_t */

/*
 * MERGELSB, MERGEMSB, MERGEDUAL16LSB, DUALUCLIPI, DSPIDUALADD,
 * DSPIDUALSUB and FUNSHIFT1/2/3 used below are TriMedia custom-operation
 * intrinsics, assumed to be declared by the TCS toolchain headers.
 */
#define SCALETWOBIT 2
#define SCALEONEBIT 1
#define ZERO 0
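
/*
 * Thin wrappers around the TriMedia merge operations: MERGELSB/MERGEMSB
 * interleave the two least/most significant bytes of their operands, and
 * MERGEDUAL16LSB packs the low byte of each 16-bit halfword of its
 * operands.  MERGEDUAL16_LSB deliberately swaps the operand order so the
 * packed bytes land in the required (little-endian) memory order.
 */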
#define MERGE_LSB(a, b) MERGELSB(a, b)
#define MERGE_MSB(a, b) MERGEMSB(a, b)
#define MERGEDUAL16_LSB(a, b) MERGEDUAL16LSB(b, a)
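
/*
 * Copy one 16-pixel row as four 32-bit words and advance both pointers
 * by one row (stride_scale_4 longs = stride bytes).  The trailing `s'
 * variant is used for the last row and omits the pointer advance.
 */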
#define COPY16TO16() \
(dstptr)[0] = (srcptr)[0]; \
(dstptr)[1] = (srcptr)[1]; \
(dstptr)[2] = (srcptr)[2]; \
(dstptr)[3] = (srcptr)[3]; \
dstptr += stride_scale_4; \
srcptr += stride_scale_4;
#define COPY16TO16s() \
(dstptr)[0] = (srcptr)[0]; \
(dstptr)[1] = (srcptr)[1]; \
(dstptr)[2] = (srcptr)[2]; \
(dstptr)[3] = (srcptr)[3];
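
/*
 * Copy a 16x16 block of 8-bit pixels from src to dst.  Both pointers
 * must be 32-bit aligned and stride (the row pitch in bytes) must be a
 * multiple of 4.  The row loop is fully unrolled; an alternative
 * formulation as four copy8to8_8x8 calls is preserved in the trailing
 * comment.
 */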
static __inline void copy8to8_16x16(uint8_t * const dst, const uint8_t * const src, const int32_t stride)
{
#pragma TCS_no_caller_save
const int stride_scale_4 = stride>>2;
long *restrict dstptr;
long *restrict srcptr;
dstptr = (long*)dst;
srcptr = (long*)src;
COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16();
COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16();
COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16();
COPY16TO16(); COPY16TO16(); COPY16TO16(); COPY16TO16s();
/*
copy8to8_8x8(dst, src, stride);
copy8to8_8x8(dst + 8, src + 8, stride);
copy8to8_8x8(dst + 8*stride, src + 8*stride, stride);
copy8to8_8x8(dst + 8*stride + 8, src + 8*stride + 8, stride);
// */
}
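
/*
 * Copy one 8-pixel row as two 32-bit words and advance both pointers by
 * one row; the `s' variant omits the advance for the last row.
 */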
#define COPY8TO8() \
(dstptr)[0] = (srcptr)[0]; \
(dstptr)[1] = (srcptr)[1]; \
dstptr += stride_scale_4; \
srcptr += stride_scale_4;
#define COPY8TO8s() \
(dstptr)[0] = (srcptr)[0]; \
(dstptr)[1] = (srcptr)[1];
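
/*
 * Copy an 8x8 block of 8-bit pixels from src to dst.  Alignment and
 * stride requirements are the same as for copy8to8_16x16; the reference
 * C loop is kept in the trailing comment.
 */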
static __inline void copy8to8_8x8(uint8_t * const dst, const uint8_t * const src, const int32_t stride)
{
#pragma TCS_no_caller_save
const int stride_scale_4 = stride>>2;
long *restrict dstptr;
long *restrict srcptr;
dstptr = (long*)dst;
srcptr = (long*)src;
COPY8TO8(); COPY8TO8(); COPY8TO8(); COPY8TO8();
COPY8TO8(); COPY8TO8(); COPY8TO8(); COPY8TO8s();
/*
int32_t j;
for (j = 0; j < 8; j++)
{
uint32_t *d= (uint32_t*)(dst + j*stride);
const uint32_t *s = (const uint32_t*)(src + j*stride);
*(d+0) = *(s+0);
*(d+1) = *(s+1);
}
// */
}
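
/*
 * Pack one row of eight 16-bit values into eight 8-bit pixels:
 * DUALUCLIPI clips each 16-bit half of a word to [0, 255] and
 * MERGEDUAL16_LSB packs the clipped halfwords into bytes.  One source
 * row is 4 words (8 shorts), one destination row 2 words (8 bytes).
 */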
#define COPY16TO8() \
tmp0 = DUALUCLIPI((srcptr)[0], 255); \
tmp1 = DUALUCLIPI((srcptr)[1], 255); \
tmp2 = DUALUCLIPI((srcptr)[2], 255); \
tmp3 = DUALUCLIPI((srcptr)[3], 255); \
(dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
(dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3); \
dstptr += stride_scale_4; \
srcptr += 4;
#define COPY16TO8s() \
tmp0 = DUALUCLIPI((srcptr)[0], 255); \
tmp1 = DUALUCLIPI((srcptr)[1], 255); \
tmp2 = DUALUCLIPI((srcptr)[2], 255); \
tmp3 = DUALUCLIPI((srcptr)[3], 255); \
(dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
(dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3);
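
/*
 * Convert a contiguous 8x8 block of 16-bit values (e.g. IDCT output)
 * into 8-bit pixels, clipping each value to [0, 255].  src advances by
 * 16 bytes per row, dst by stride bytes; the reference C loop is kept
 * in the trailing comment.
 */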
static __inline void copy16to8_8x8(uint8_t * const dst, const int16_t * const src, int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1, tmp2, tmp3;
const int stride_scale_4 = stride>>2;
long *restrict dstptr;
long *restrict srcptr;
dstptr = (long*)dst;
srcptr = (long*)src;
COPY16TO8(); COPY16TO8(); COPY16TO8(); COPY16TO8();
COPY16TO8(); COPY16TO8(); COPY16TO8(); COPY16TO8s();
/*
int32_t i, j;
for (j = 0; j < 8; j++)
{
for (i = 0; i < 8; i++)
{
int16_t pixel = src[j * 8 + i];
if (pixel < 0)
{
pixel = 0;
}
else if (pixel > 255)
{
pixel = 255;
}
dst[j * stride + i] = (uint8_t) pixel;
}
}
// */
}
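
/*
 * Add one row of eight 16-bit values to eight 8-bit pixels with
 * saturation: MERGE_LSB/MERGE_MSB(ZERO, x) zero-extend the packed bytes
 * to 16-bit halfwords, DSPIDUALADD performs two clipped 16-bit adds,
 * and the sums are clipped to [0, 255] and repacked into bytes.
 */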
#define ADD16TO8() \
tmp0 = MERGE_LSB(ZERO,(dstptr)[0]); \
tmp1 = MERGE_MSB(ZERO,(dstptr)[0]); \
i0 = DSPIDUALADD( tmp0, (srcptr)[0]); \
i1 = DSPIDUALADD( tmp1, (srcptr)[1]); \
tmp0 = DUALUCLIPI( i0, 255 ); \
tmp1 = DUALUCLIPI( i1, 255 ); \
tmp2 = MERGE_LSB(ZERO,(dstptr)[1]); \
tmp3 = MERGE_MSB(ZERO,(dstptr)[1]); \
i0 = DSPIDUALADD( tmp2, (srcptr)[2]); \
i1 = DSPIDUALADD( tmp3, (srcptr)[3]); \
tmp2 = DUALUCLIPI( i0, 255 ); \
tmp3 = DUALUCLIPI( i1, 255 ); \
(dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
(dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3); \
dstptr += stride_scale_4; \
srcptr += 4;
#define ADD16TO8s() \
tmp0 = MERGE_LSB(ZERO,(dstptr)[0]); \
tmp1 = MERGE_MSB(ZERO,(dstptr)[0]); \
i0 = DSPIDUALADD( tmp0, (srcptr)[0]); \
i1 = DSPIDUALADD( tmp1, (srcptr)[1]); \
tmp0 = DUALUCLIPI( i0, 255 ); \
tmp1 = DUALUCLIPI( i1, 255 ); \
tmp2 = MERGE_LSB(ZERO,(dstptr)[1]); \
tmp3 = MERGE_MSB(ZERO,(dstptr)[1]); \
i0 = DSPIDUALADD( tmp2, (srcptr)[2]); \
i1 = DSPIDUALADD( tmp3, (srcptr)[3]); \
tmp2 = DUALUCLIPI( i0, 255 ); \
tmp3 = DUALUCLIPI( i1, 255 ); \
(dstptr)[0] = MERGEDUAL16_LSB(tmp0, tmp1); \
(dstptr)[1] = MERGEDUAL16_LSB(tmp2, tmp3);
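
/*
 * Add a contiguous 8x8 block of 16-bit residuals (e.g. IDCT output) to
 * the 8-bit prediction already in dst, saturating to [0, 255].  Used to
 * reconstruct inter blocks; the reference C loop is kept in the
 * trailing comment.
 */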
static __inline void add16to8_8x8(uint8_t * const dst, const int16_t * const src, int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1, tmp2, tmp3;
int i0, i1;
const int stride_scale_4 = stride>>2;
long *restrict dstptr;
long *restrict srcptr;
dstptr = (long*)dst;
srcptr = (long*)src;
ADD16TO8(); ADD16TO8(); ADD16TO8(); ADD16TO8();
ADD16TO8(); ADD16TO8(); ADD16TO8(); ADD16TO8s();
/*
int32_t i, j;
for (j = 0; j < 8; j++)
{
for (i = 0; i < 8; i++)
{
int16_t pixel = (int16_t) dst[j * stride + i] + src[j * 8 + i];
if (pixel < 0)
{
pixel = 0;
}
else if (pixel > 255)
{
pixel = 255;
}
dst[j * stride + i] = (uint8_t) pixel;
}
}
// */
}
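
/*
 * Expand one row of eight 8-bit pixels into eight 16-bit values by
 * zero-extending the packed bytes with MERGE_LSB/MERGE_MSB(ZERO, x).
 */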
#define COPY8TO16() \
tmp0 = (srcptr)[0]; \
tmp1 = (srcptr)[1]; \
(dstptr)[0] = MERGE_LSB(ZERO,tmp0); \
(dstptr)[1] = MERGE_MSB(ZERO,tmp0); \
(dstptr)[2] = MERGE_LSB(ZERO,tmp1); \
(dstptr)[3] = MERGE_MSB(ZERO,tmp1); \
dstptr += 4; \
srcptr += stride_scale_4;
#define COPY8TO16s() \
tmp0 = (srcptr)[0]; \
tmp1 = (srcptr)[1]; \
(dstptr)[0] = MERGE_LSB(ZERO,tmp0); \
(dstptr)[1] = MERGE_MSB(ZERO,tmp0); \
(dstptr)[2] = MERGE_LSB(ZERO,tmp1); \
(dstptr)[3] = MERGE_MSB(ZERO,tmp1);
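
/*
 * Zero-extend an 8x8 block of 8-bit pixels into a contiguous 8x8 block
 * of 16-bit values (e.g. DCT input).  dst advances by 16 bytes per row,
 * src by stride bytes; the reference C loop is kept in the trailing
 * comment.
 */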
static __inline void copy8to16_8x8(int16_t * const dst, const uint8_t * const src, int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1;
const int stride_scale_4 = stride>>2;
long *restrict dstptr;
long *restrict srcptr;
dstptr = (long*)dst;
srcptr = (long*)src;
COPY8TO16(); COPY8TO16(); COPY8TO16(); COPY8TO16();
COPY8TO16(); COPY8TO16(); COPY8TO16(); COPY8TO16s();
/*
int32_t i, j;
for (j = 0; j < 8; j++)
{
for (i = 0; i < 8; i++)
dst[j * 8 + i] = (int16_t) src[j * stride + i];
}
// */
}
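
/*
 * Row macros for sub8to16_8x8, one per byte misalignment of ref.  The
 * FUNSHIFTn funnel shifts rebuild an unaligned 32-bit word from two
 * aligned loads; a misalignment of k bytes uses FUNSHIFT(4-k), and the
 * aligned case (SUB8TO16) loads directly.  Each macro also stores the
 * fetched reference row back into cur and writes the 16-bit differences
 * cur - ref to dst via zero-extension (MERGE_LSB/MERGE_MSB) and clipped
 * 16-bit subtracts (DSPIDUALSUB).
 */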
#define SUB8TO16_1() \
tmp2 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
curptr += stride_scale_4; \
refptr += stride_scale_4; \
dstptr += 4;
#define SUB8TO16_1s() \
tmp2 = FUNSHIFT3((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT3((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));
#define SUB8TO16_2() \
tmp2 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
curptr += stride_scale_4; \
refptr += stride_scale_4; \
dstptr += 4;
#define SUB8TO16_2s() \
tmp2 = FUNSHIFT2((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT2((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));
#define SUB8TO16_3() \
tmp2 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
curptr += stride_scale_4; \
refptr += stride_scale_4; \
dstptr += 4;
#define SUB8TO16_3s() \
tmp2 = FUNSHIFT1((refptr)[1], (refptr)[0]); \
tmp3 = FUNSHIFT1((refptr)[2], (refptr)[1]); \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));
#define SUB8TO16() \
tmp2 = (refptr)[0]; \
tmp3 = (refptr)[1]; \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3)); \
curptr += stride_scale_4; \
refptr += stride_scale_4; \
dstptr += 4;
#define SUB8TO16s() \
tmp2 = (refptr)[0]; \
tmp3 = (refptr)[1]; \
tmp0 = (curptr)[0]; \
tmp1 = (curptr)[1]; \
(curptr)[0] = tmp2; \
(curptr)[1] = tmp3; \
(dstptr)[0] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp0),MERGE_LSB(ZERO,tmp2)); \
(dstptr)[1] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp0),MERGE_MSB(ZERO,tmp2)); \
(dstptr)[2] = DSPIDUALSUB(MERGE_LSB(ZERO,tmp1),MERGE_LSB(ZERO,tmp3)); \
(dstptr)[3] = DSPIDUALSUB(MERGE_MSB(ZERO,tmp1),MERGE_MSB(ZERO,tmp3));
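
/*
 * Compute the 16-bit difference block dst = cur - ref for an 8x8 block
 * and, as a side effect, copy the reference block into cur, so that cur
 * afterwards holds the motion-compensated prediction needed for
 * reconstruction.  cur and dst must be 32-bit aligned; ref may have any
 * alignment, the low two address bits selecting one of four unrolled
 * paths.  The reference C loop is kept in the trailing comment.
 */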
static __inline void sub8to16_8x8(int16_t * const dst, uint8_t * const cur, const uint8_t * ref, const int32_t stride)
{
#pragma TCS_no_caller_save
int tmp0, tmp1, tmp2, tmp3;
const int stride_scale_4 = stride>>2;
const uintptr_t ref1 = (uintptr_t) ref & ~(uintptr_t) 3; /* ref rounded down to 32-bit alignment */
const int ref2 = (int) ((uintptr_t) ref & 3); /* byte misalignment of ref: 0..3 */
long *restrict dstptr;
long *restrict curptr;
long *restrict refptr;
dstptr = (long*)dst;
curptr = (long*)cur;
refptr = (long*)ref1;
switch (ref2)
{
case 1:
SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1();
SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1(); SUB8TO16_1s();
return;
case 2:
SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2();
SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2(); SUB8TO16_2s();
return;
case 3:
SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3();
SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3(); SUB8TO16_3s();
return;
default:
SUB8TO16(); SUB8TO16(); SUB8TO16(); SUB8TO16();
SUB8TO16(); SUB8TO16(); SUB8TO16(); SUB8TO16s();
return;
}
/*
int32_t i, j;
for (j = 0; j < 8; j++)
{
for (i = 0; i < 8; i++)
{
uint8_t c = cur[j * stride + i];
uint8_t r = ref[j * stride + i];
cur[j * stride + i] = r;
dst[j * 8 + i] = (int16_t) c - (int16_t) r;
}
}
// */
}
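
/*
 * Apply sub8to16_8x8 to the four 8x8 sub-blocks of a 16x16 macroblock;
 * the 16-bit difference blocks are stored contiguously, 64 values each.
 */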
static __inline void sub8to16_16x16(int16_t * dst, uint8_t * const cur, const uint8_t * ref, const int32_t stride)
{
int stride8 = stride * 8;
sub8to16_8x8(dst, cur, ref, stride);
sub8to16_8x8(dst+64, cur+8, ref+8, stride);
sub8to16_8x8(dst+128, cur+stride8, ref+stride8, stride);
sub8to16_8x8(dst+192, cur+stride8+8, ref+stride8+8, stride);
}
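
/*
 * Illustrative usage sketch (not part of this header): the round trip
 * of one inter 8x8 block in an MPEG-4 encoder built on these
 * primitives.  fdct/quantize/dequantize/idct are hypothetical stand-ins
 * for the codec's actual transform and quantization routines.
 *
 *   int16_t block[64];
 *   // Differencing: block = cur - ref; cur now holds the prediction.
 *   sub8to16_8x8(block, cur, ref, stride);
 *   fdct(block); quantize(block);      // encode path
 *   dequantize(block); idct(block);    // local reconstruction path
 *   // Reconstruction: cur = clip(prediction + residual, 0, 255).
 *   add16to8_8x8(cur, block, stride);
 *   // Intra blocks skip the prediction and use the plain transfers:
 *   copy8to16_8x8(block, cur, stride); // pixels -> transform input
 *   copy16to8_8x8(cur, block, stride); // transform output -> pixels
 */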
#endif /* _MEM_TRANSFER_H */