/* Fast half-pixel (half-pel) interpolation routines used by the MPEG-4 video codec on the Philips TriMedia 1500-series SoC. */
#ifndef _INTERPOLATE8X8_H_
#define _INTERPOLATE8X8_H_
/*
 * HALFPEL_H_RND0 - horizontal half-pel filter for one 8-pixel row, then step
 * both cursors to the next row.
 *
 * Expects these locals in the expanding scope: src1 (byte pointer to the
 * source row), dstptr (word pointer to the destination row), stride,
 * stride_scale_4, and the scratch integers r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * Two words are assembled from src1[0..7], two more from src1[1..8], and
 * QUADAVG of each pair is stored to dstptr[0] and dstptr[1].  halfpel_h()
 * uses this macro when rounding == 0; per the scalar reference commented in
 * that function the result should be dst[i] = (src[i] + src[i+1] + 1) >> 1,
 * so QUADAVG is presumably a byte-wise rounded-up average (TODO confirm
 * against the intrinsic definition).
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and stays correct inside an unbraced if/else.
 */
#define HALFPEL_H_RND0() \
    do { \
        /* pixels 0..7 of the row, packed pairwise */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        /* pixels 1..8: the right-hand neighbours */ \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
        /* advance to the next row */ \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)
/*
 * HALFPEL_H_RND0s - same row computation as HALFPEL_H_RND0 but without
 * advancing src1 or dstptr; halfpel_h() uses it for the final row.
 *
 * Expects src1, dstptr and the scratch integers r0-r3, t0-t3, s0, s1, q0, q1
 * in the expanding scope.  Stores QUADAVG of the words built from
 * src1[0..7] and src1[1..8] to dstptr[0] and dstptr[1] (presumably a
 * byte-wise rounded-up average -- TODO confirm against the intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_H_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
    } while (0)
/*
 * HALFPEL_H_RND1 - horizontal half-pel filter for one 8-pixel row using
 * QUADAVG0, then step both cursors to the next row.
 *
 * Expects src1, dstptr, stride, stride_scale_4 and the scratch integers
 * r0-r3, t0-t3, s0, s1, q0, q1 in the expanding scope.
 *
 * halfpel_h() uses this macro when rounding != 0; per the scalar reference
 * commented in that function the result should be
 * dst[i] = (src[i] + src[i+1]) >> 1, so QUADAVG0 is presumably a byte-wise
 * truncating average (TODO confirm against the intrinsic definition).
 *
 * Wrapped in do { } while (0) so it expands to a single statement and is
 * safe inside an unbraced if/else.
 */
#define HALFPEL_H_RND1() \
    do { \
        /* pixels 0..7 of the row, packed pairwise */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        /* pixels 1..8: the right-hand neighbours */ \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
        /* advance to the next row */ \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)
/*
 * HALFPEL_H_RND1s - same row computation as HALFPEL_H_RND1 but without
 * advancing src1 or dstptr; halfpel_h() uses it for the final row.
 *
 * Expects src1, dstptr and the scratch integers r0-r3, t0-t3, s0, s1, q0, q1
 * in the expanding scope.  Stores QUADAVG0 of the words built from
 * src1[0..7] and src1[1..8] to dstptr[0] and dstptr[1] (presumably a
 * byte-wise truncating average -- TODO confirm against the intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_H_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t0 = PACKBYTES(src1[2], src1[1]); \
        t1 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[6], src1[5]); \
        t3 = PACKBYTES(src1[8], src1[7]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
    } while (0)
/*
 * halfpel_h - horizontal half-pel interpolation of one 8x8 block.
 *
 * dst       destination block; written through a long pointer, two words
 *           per row
 * src       source block; each row reads bytes 0..8
 * stride    byte distance between consecutive source rows
 * rounding  zero selects the HALFPEL_H_RND0 row macros, non-zero the
 *           HALFPEL_H_RND1 ones
 *
 * Scalar reference for the intended result (from the original code):
 *     r = 1 - rounding;
 *     for (j = 0; j < 8 * stride; j += stride)
 *         for (i = 0; i < 8; i++)
 *             dst[j + i] = (uint8_t)((src[j + i] + src[j + i + 1] + r) >> 1);
 *
 * The eight rows are fully unrolled: seven advancing row macros followed by
 * the non-advancing "s" variant for the last row.
 *
 * NOTE(review): the destination cursor moves by stride >> 2 long elements
 * per row, which matches one stride only if long is 4 bytes and stride is a
 * multiple of 4; dst is also accessed as long, so it presumably must be
 * word-aligned -- confirm with callers.
 */
static __inline void halfpel_h(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch values and cursors; the names are fixed by the row macros. */
    int r0, r1, r2, r3;
    int t0, t1, t2, t3;
    int s0, s1, q0, q1;
    const int stride_scale_4 = stride >> 2;
    long *restrict dstptr = (long*)dst;
    const unsigned char *restrict src1 = src;

    if (!rounding)
    {
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0();
        HALFPEL_H_RND0s();
    }
    else
    {
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1();
        HALFPEL_H_RND1s();
    }
}
/*
 * HALFPEL_V_RND0 - vertical half-pel filter for one 8-pixel row, then step
 * both cursors to the next row.
 *
 * Expects these locals in the expanding scope: src1 (byte pointer to the
 * current source row), dstptr (word pointer to the destination row), stride,
 * stride_scale_4, and the scratch integers r0-r3, t0-t3, s0, s1, q0, q1.
 *
 * Words are assembled from src1[0..7] and from the row below
 * (src1[stride + 0..7]); QUADAVG of each pair is stored to dstptr[0] and
 * dstptr[1].  halfpel_v() uses this macro when rounding == 0; per the scalar
 * reference commented in that function the result should be
 * dst[i] = (src[i] + src[i+stride] + 1) >> 1, so QUADAVG is presumably a
 * byte-wise rounded-up average (TODO confirm against the intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement and is
 * safe inside an unbraced if/else.
 */
#define HALFPEL_V_RND0() \
    do { \
        /* left four pixels: current row and the row below */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        /* right four pixels */ \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
        /* advance to the next row */ \
        dstptr += stride_scale_4; \
        src1 += stride; \
    } while (0)
/*
 * HALFPEL_V_RND0s - same row computation as HALFPEL_V_RND0 but without
 * advancing src1 or dstptr; halfpel_v() uses it for the final row.
 *
 * Expects src1, dstptr, stride and the scratch integers r0-r3, t0-t3, s0,
 * s1, q0, q1 in the expanding scope.  Stores QUADAVG of the words built
 * from src1[0..7] and src1[stride + 0..7] to dstptr[0] and dstptr[1]
 * (presumably a byte-wise rounded-up average -- TODO confirm against the
 * intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_V_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG(s0, q0); \
        dstptr[1] = QUADAVG(s1, q1); \
    } while (0)
/*
 * HALFPEL_V_RND1 - vertical half-pel filter for one 8-pixel row using
 * QUADAVG0, then step both cursors to the next row.
 *
 * Expects src1, dstptr, stride, stride_scale_4 and the scratch integers
 * r0-r3, t0-t3, s0, s1, q0, q1 in the expanding scope.
 *
 * halfpel_v() uses this macro when rounding != 0; per the scalar reference
 * commented in that function the result should be
 * dst[i] = (src[i] + src[i+stride]) >> 1, so QUADAVG0 is presumably a
 * byte-wise truncating average (TODO confirm against the intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement and is
 * safe inside an unbraced if/else.
 */
#define HALFPEL_V_RND1() \
    do { \
        /* left four pixels: current row and the row below */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        /* right four pixels */ \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
        /* advance to the next row */ \
        dstptr += stride_scale_4; \
        src1 += stride; \
    } while (0)
/*
 * HALFPEL_V_RND1s - same row computation as HALFPEL_V_RND1 but without
 * advancing src1 or dstptr; halfpel_v() uses it for the final row.
 *
 * Expects src1, dstptr, stride and the scratch integers r0-r3, t0-t3, s0,
 * s1, q0, q1 in the expanding scope.  Stores QUADAVG0 of the words built
 * from src1[0..7] and src1[stride + 0..7] to dstptr[0] and dstptr[1]
 * (presumably a byte-wise truncating average -- TODO confirm against the
 * intrinsic).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_V_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[3], src1[2]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        s0 = PACK16LSB(r1, r0); \
        q0 = PACK16LSB(t1, t0); \
        r2 = PACKBYTES(src1[5], src1[4]); \
        r3 = PACKBYTES(src1[7], src1[6]); \
        t2 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t3 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        s1 = PACK16LSB(r3, r2); \
        q1 = PACK16LSB(t3, t2); \
        dstptr[0] = QUADAVG0(s0, q0); \
        dstptr[1] = QUADAVG0(s1, q1); \
    } while (0)
/*
 * halfpel_v - vertical half-pel interpolation of one 8x8 block.
 *
 * dst       destination block; written through a long pointer, two words
 *           per row
 * src       source block; each row reads bytes 0..7 of the current row and
 *           of the row one stride below
 * stride    byte distance between consecutive source rows
 * rounding  zero selects the HALFPEL_V_RND0 row macros, non-zero the
 *           HALFPEL_V_RND1 ones
 *
 * Scalar reference for the intended result (from the original code):
 *     r = 1 - rounding;
 *     for (j = 0; j < 8 * stride; j += stride)
 *         for (i = 0; i < 8; i++)
 *             dst[j + i] = (uint8_t)((src[j + i] + src[j + stride + i] + r) >> 1);
 *
 * The eight rows are fully unrolled: seven advancing row macros followed by
 * the non-advancing "s" variant for the last row.
 *
 * NOTE(review): the destination cursor moves by stride >> 2 long elements
 * per row, which matches one stride only if long is 4 bytes and stride is a
 * multiple of 4; dst is also accessed as long, so it presumably must be
 * word-aligned -- confirm with callers.
 */
static __inline void halfpel_v(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch values and cursors; the names are fixed by the row macros. */
    int r0, r1, r2, r3;
    int t0, t1, t2, t3;
    int s0, s1, q0, q1;
    const int stride_scale_4 = stride >> 2;
    long *restrict dstptr = (long*)dst;
    const unsigned char *restrict src1 = src;

    if (!rounding)
    {
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0();
        HALFPEL_V_RND0s();
    }
    else
    {
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1();
        HALFPEL_V_RND1s();
    }
}
/*
 * HALFPEL_HV_RND0 - diagonal (horizontal + vertical) half-pel filter for one
 * 8-pixel row, then step both cursors to the next row.
 *
 * Expects these locals in the expanding scope: src1 (byte pointer to the
 * current source row), dstptr (word pointer to the destination row), stride,
 * stride_scale_4, and the scratch integers r0-r7, t0-t7, m0-m3.
 *
 * For each pair of output pixels, overlapping byte pairs from the current
 * row (src1[k], src1[k+1], src1[k+2]) and from the row below are packed and
 * handed to BILINEAR2; the four results are merged with MERGEDUAL16LSB into
 * dstptr[0] and dstptr[1].  halfpel_hv() uses this macro when rounding == 0;
 * per the scalar reference commented in that function the result should be
 * dst[i] = (src[i] + src[i+1] + src[i+stride] + src[i+stride+1] + 2) >> 2
 * (BILINEAR2 / MERGEDUAL16LSB semantics assumed -- TODO confirm against the
 * intrinsic definitions).
 *
 * Wrapped in do { } while (0) so it expands to a single statement and is
 * safe inside an unbraced if/else.
 */
#define HALFPEL_HV_RND0() \
    do { \
        /* output pixels 0 and 1 */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR2(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        /* output pixels 2 and 3 */ \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR2(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        /* output pixels 4 and 5 */ \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR2(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        /* output pixels 6 and 7 */ \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR2(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
        /* advance to the next row */ \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)
/*
 * HALFPEL_HV_RND0s - same row computation as HALFPEL_HV_RND0 but without
 * advancing src1 or dstptr; halfpel_hv() uses it for the final row.
 *
 * Expects src1, dstptr, stride and the scratch integers r0-r7, t0-t7, m0-m3
 * in the expanding scope.  Packs overlapping byte pairs from src1[0..8] and
 * src1[stride + 0..8], combines them with BILINEAR2 and merges the results
 * with MERGEDUAL16LSB into dstptr[0] and dstptr[1] (intrinsic semantics
 * assumed from their names -- TODO confirm).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_HV_RND0s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR2(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR2(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR2(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR2(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
    } while (0)
/*
 * HALFPEL_HV_RND1 - diagonal (horizontal + vertical) half-pel filter for one
 * 8-pixel row using BILINEAR1, then step both cursors to the next row.
 *
 * Expects src1, dstptr, stride, stride_scale_4 and the scratch integers
 * r0-r7, t0-t7, m0-m3 in the expanding scope.
 *
 * halfpel_hv() uses this macro when rounding != 0; per the scalar reference
 * commented in that function the result should be
 * dst[i] = (src[i] + src[i+1] + src[i+stride] + src[i+stride+1] + 1) >> 2
 * (BILINEAR1 / MERGEDUAL16LSB semantics assumed -- TODO confirm against the
 * intrinsic definitions).
 *
 * Wrapped in do { } while (0) so it expands to a single statement and is
 * safe inside an unbraced if/else.
 */
#define HALFPEL_HV_RND1() \
    do { \
        /* output pixels 0 and 1 */ \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR1(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        /* output pixels 2 and 3 */ \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR1(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        /* output pixels 4 and 5 */ \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR1(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        /* output pixels 6 and 7 */ \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR1(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
        /* advance to the next row */ \
        src1 += stride; \
        dstptr += stride_scale_4; \
    } while (0)
/*
 * HALFPEL_HV_RND1s - same row computation as HALFPEL_HV_RND1 but without
 * advancing src1 or dstptr; halfpel_hv() uses it for the final row.
 *
 * Expects src1, dstptr, stride and the scratch integers r0-r7, t0-t7, m0-m3
 * in the expanding scope.  Packs overlapping byte pairs from src1[0..8] and
 * src1[stride + 0..8], combines them with BILINEAR1 and merges the results
 * with MERGEDUAL16LSB into dstptr[0] and dstptr[1] (intrinsic semantics
 * assumed from their names -- TODO confirm).
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define HALFPEL_HV_RND1s() \
    do { \
        r0 = PACKBYTES(src1[1], src1[0]); \
        r1 = PACKBYTES(src1[2], src1[1]); \
        t0 = PACKBYTES(src1[1+stride], src1[0+stride]); \
        t1 = PACKBYTES(src1[2+stride], src1[1+stride]); \
        m0 = BILINEAR1(PACK16LSB(r1, r0), PACK16LSB(t1, t0)); \
        r2 = PACKBYTES(src1[3], src1[2]); \
        r3 = PACKBYTES(src1[4], src1[3]); \
        t2 = PACKBYTES(src1[3+stride], src1[2+stride]); \
        t3 = PACKBYTES(src1[4+stride], src1[3+stride]); \
        m1 = BILINEAR1(PACK16LSB(r3, r2), PACK16LSB(t3, t2)); \
        r4 = PACKBYTES(src1[5], src1[4]); \
        r5 = PACKBYTES(src1[6], src1[5]); \
        t4 = PACKBYTES(src1[5+stride], src1[4+stride]); \
        t5 = PACKBYTES(src1[6+stride], src1[5+stride]); \
        m2 = BILINEAR1(PACK16LSB(r5, r4), PACK16LSB(t5, t4)); \
        r6 = PACKBYTES(src1[7], src1[6]); \
        r7 = PACKBYTES(src1[8], src1[7]); \
        t6 = PACKBYTES(src1[7+stride], src1[6+stride]); \
        t7 = PACKBYTES(src1[8+stride], src1[7+stride]); \
        m3 = BILINEAR1(PACK16LSB(r7, r6), PACK16LSB(t7, t6)); \
        dstptr[0] = MERGEDUAL16LSB(m1, m0); \
        dstptr[1] = MERGEDUAL16LSB(m3, m2); \
    } while (0)
/*
 * halfpel_hv - diagonal (horizontal + vertical) half-pel interpolation of
 * one 8x8 block.
 *
 * dst       destination block; written through a long pointer, two words
 *           per row
 * src       source block; each row reads bytes 0..8 of the current row and
 *           of the row one stride below
 * stride    byte distance between consecutive source rows
 * rounding  zero selects the HALFPEL_HV_RND0 row macros, non-zero the
 *           HALFPEL_HV_RND1 ones
 *
 * Scalar reference for the intended result (from the original code):
 *     r = 2 - rounding;
 *     for (j = 0; j < 8 * stride; j += stride)
 *         for (i = 0; i < 8; i++)
 *             dst[j + i] = (uint8_t)((src[j + i] + src[j + i + 1] +
 *                                     src[j + stride + i] +
 *                                     src[j + stride + i + 1] + r) >> 2);
 *
 * The eight rows are fully unrolled: seven advancing row macros followed by
 * the non-advancing "s" variant for the last row.
 *
 * NOTE(review): the destination cursor moves by stride >> 2 long elements
 * per row, which matches one stride only if long is 4 bytes and stride is a
 * multiple of 4; dst is also accessed as long, so it presumably must be
 * word-aligned -- confirm with callers.
 */
static __inline void halfpel_hv(uint8_t * const dst, const uint8_t * const src, const int32_t stride, const int32_t rounding)
{
#pragma TCS_no_caller_save
    /* Scratch values and cursors; the names are fixed by the row macros. */
    int r0, r1, r2, r3, r4, r5, r6, r7;
    int t0, t1, t2, t3, t4, t5, t6, t7;
    int m0, m1, m2, m3;
    const int stride_scale_4 = stride >> 2;
    long *restrict dstptr = (long*)dst;
    const unsigned char *restrict src1 = src;

    if (!rounding)
    {
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0();
        HALFPEL_HV_RND0s();
    }
    else
    {
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1();
        HALFPEL_HV_RND1s();
    }
}
#endif