/* 8x8 fast discrete cosine transform for the Philips TriMedia 1500 series. */
/*
 * Coefficient words used as the second operand of IFIR16 in the DCT macros.
 * The trailing comment on each line shows the high and low 16-bit halves of
 * the word read as signed integers.
 */
#define C0   0xA73B4B42   /* (-22725,  19266) */
#define C1   0x11A8CDB7   /* (  4520, -12873) */
#define C2   0xCDB7A73B   /* (-12873, -22725) */
#define C3   0x4B42EE58   /* ( 19266,  -4520) */
#define C4   0x4B4211A8   /* ( 19266,   4520) */
#define C5   0x3249A73B   /* ( 12873, -22725) */
#define C6   0x11A83249   /* (  4520,  12873) */
#define C7   0x58C54B42   /* ( 22725,  19266) */
#define C8   0x40004000   /* ( 16384,  16384) */
#define C9   0x4000C000   /* ( 16384, -16384) */
#define C10  0x539E22A3   /* ( 21406,   8867) */
#define C11  0x22A3AC62   /* (  8867, -21406) */
/*
 * Small helpers shared by the packing and load steps of the DCT macros.
 */

/*
 * Doubles x and adds 0x8000.  Written as a multiplication so that the
 * argument is expanded (and therefore evaluated) exactly once; the value is
 * the same as (x) + (x) + 0x8000.
 * NOTE(review): 0x8000 is presumably the rounding bias for the upper-halfword
 * extraction done by PACK16MSB -- confirm against the operator documentation.
 */
#define HROUND(x) (2 * (x) + 0x8000)

/* Adds 0x8000 to x (no doubling). */
#define VROUND(x) ((x) + 0x8000)

/* PACK16MSB with its two operands swapped. */
#define PACK16_MSB(a, b) PACK16MSB((b), (a))

/* Yields the word after ROLI(16, ...) has been applied to it. */
#define READ_NORM(a) ROLI(16, (a))

/* Yields the word unchanged. */
#define READ_REV(a) (a)
/*
 * horiz_dct(tab, o0, ..., o7)
 *
 * Reads the four words tab[0]..tab[3] and assigns eight results to o0..o7.
 *
 * The macro uses tmp0..tmp3, tmp101, tmp132, tmp176, tmp145, tmp201 and
 * tmp232 as scratch variables; they are not parameters and must be declared
 * in the invoking scope.  It expands to a plain list of statements (no
 * enclosing braces).
 */
#define horiz_dct(tab, o0, o1, o2, o3, o4, o5, o6, o7) \
    /* Words 0 and 2 pass through READ_NORM, words 1 and 3 through READ_REV. */ \
    tmp0 = READ_NORM((tab)[0]); \
    tmp1 = READ_REV((tab)[1]); \
    tmp2 = READ_NORM((tab)[2]); \
    tmp3 = READ_REV((tab)[3]); \
    \
    /* Stage 1: sum and difference of words (0, 3) and of words (1, 2). */ \
    tmp101 = DSPIDUALADD(tmp0, tmp3); \
    tmp176 = DSPIDUALSUB(tmp0, tmp3); \
    tmp132 = DSPIDUALADD(tmp1, tmp2); \
    tmp145 = DSPIDUALSUB(tmp1, tmp2); \
    \
    /* Stage 2: sum and difference of the two stage-1 sums. */ \
    tmp201 = DSPIDUALADD(tmp101, tmp132); \
    tmp232 = DSPIDUALSUB(tmp101, tmp132); \
    \
    /* Even-numbered outputs use only the stage-2 values. */ \
    o0 = IFIR16(tmp201, C8); \
    o2 = IFIR16(tmp232, C10); \
    o4 = IFIR16(tmp201, C9); \
    o6 = IFIR16(tmp232, C11); \
    \
    /* Odd-numbered outputs combine both stage-1 differences. */ \
    o1 = IFIR16(tmp145, C6) + IFIR16(tmp176, C7); \
    o3 = IFIR16(tmp145, C2) + IFIR16(tmp176, C3); \
    o5 = IFIR16(tmp145, C4) + IFIR16(tmp176, C5); \
    o7 = IFIR16(tmp145, C0) + IFIR16(tmp176, C1);
/*
 * packltor(o0, ..., o7)
 *
 * For k = 0..7 assigns ok = PACK16MSB(HROUND(sk), HROUND(tk)): the s-value
 * is the first PACK16MSB operand and the t-value the second.
 *
 * s0..s7 and t0..t7 are not parameters; they are taken from the invoking
 * scope.  Expands to a plain list of statements.
 */
#define PACKLTOR_PAIR_(first, second) PACK16MSB(HROUND(first), HROUND(second))
#define packltor(o0, o1, o2, o3, o4, o5, o6, o7) \
    o0 = PACKLTOR_PAIR_(s0, t0); \
    o1 = PACKLTOR_PAIR_(s1, t1); \
    o2 = PACKLTOR_PAIR_(s2, t2); \
    o3 = PACKLTOR_PAIR_(s3, t3); \
    o4 = PACKLTOR_PAIR_(s4, t4); \
    o5 = PACKLTOR_PAIR_(s5, t5); \
    o6 = PACKLTOR_PAIR_(s6, t6); \
    o7 = PACKLTOR_PAIR_(s7, t7);
/*
 * packrtol(o0, ..., o7)
 *
 * For k = 0..7 assigns ok = PACK16MSB(HROUND(tk), HROUND(sk)): the t-value
 * is the first PACK16MSB operand and the s-value the second, i.e. the
 * operand order is the reverse of packltor.
 *
 * s0..s7 and t0..t7 are not parameters; they are taken from the invoking
 * scope.  Expands to a plain list of statements.
 */
#define PACKRTOL_PAIR_(first, second) PACK16MSB(HROUND(first), HROUND(second))
#define packrtol(o0, o1, o2, o3, o4, o5, o6, o7) \
    o0 = PACKRTOL_PAIR_(t0, s0); \
    o1 = PACKRTOL_PAIR_(t1, s1); \
    o2 = PACKRTOL_PAIR_(t2, s2); \
    o3 = PACKRTOL_PAIR_(t3, s3); \
    o4 = PACKRTOL_PAIR_(t4, s4); \
    o5 = PACKRTOL_PAIR_(t5, s5); \
    o6 = PACKRTOL_PAIR_(t6, s6); \
    o7 = PACKRTOL_PAIR_(t7, s7);
/*
 * vertical_dct(in0, in1, in2, in3, o0, ..., o7)
 *
 * Same computation as horiz_dct, except that the four input words are
 * passed as arguments instead of being loaded from memory.
 *
 * Uses tmp101, tmp132, tmp176, tmp145, tmp201 and tmp232 from the invoking
 * scope as scratch variables.  Expands to a plain list of statements.
 */
#define vertical_dct(in0, in1, in2, in3, o0, o1, o2, o3, o4, o5, o6, o7) \
    /* Stage 1: sum and difference of inputs (0, 3) and of inputs (1, 2). */ \
    tmp101 = DSPIDUALADD(in0, in3); \
    tmp176 = DSPIDUALSUB(in0, in3); \
    tmp132 = DSPIDUALADD(in1, in2); \
    tmp145 = DSPIDUALSUB(in1, in2); \
    \
    /* Stage 2: sum and difference of the two stage-1 sums. */ \
    tmp201 = DSPIDUALADD(tmp101, tmp132); \
    tmp232 = DSPIDUALSUB(tmp101, tmp132); \
    \
    /* Even-numbered outputs use only the stage-2 values. */ \
    o0 = IFIR16(tmp201, C8); \
    o2 = IFIR16(tmp232, C10); \
    o4 = IFIR16(tmp201, C9); \
    o6 = IFIR16(tmp232, C11); \
    \
    /* Odd-numbered outputs combine both stage-1 differences. */ \
    o1 = IFIR16(tmp145, C6) + IFIR16(tmp176, C7); \
    o3 = IFIR16(tmp145, C2) + IFIR16(tmp176, C3); \
    o5 = IFIR16(tmp145, C4) + IFIR16(tmp176, C5); \
    o7 = IFIR16(tmp145, C0) + IFIR16(tmp176, C1);
/*
 * vertical_flush(res)
 *
 * Stores eight packed words through res.  Result k (k = 0..7) is
 * PACK16_MSB(VROUND(sk), VROUND(tk)) and is written to res[4 * k], so
 * consecutive results land four elements apart.
 *
 * s0..s7 and t0..t7 are not parameters; they are taken from the invoking
 * scope.  Expands to a plain list of statements.
 */
#define vertical_flush(res) \
    (res)[4 * 0] = PACK16_MSB(VROUND(s0), VROUND(t0)); \
    (res)[4 * 1] = PACK16_MSB(VROUND(s1), VROUND(t1)); \
    (res)[4 * 2] = PACK16_MSB(VROUND(s2), VROUND(t2)); \
    (res)[4 * 3] = PACK16_MSB(VROUND(s3), VROUND(t3)); \
    (res)[4 * 4] = PACK16_MSB(VROUND(s4), VROUND(t4)); \
    (res)[4 * 5] = PACK16_MSB(VROUND(s5), VROUND(t5)); \
    (res)[4 * 6] = PACK16_MSB(VROUND(s6), VROUND(t6)); \
    (res)[4 * 7] = PACK16_MSB(VROUND(s7), VROUND(t7));
/*
 * fdct - transform the data at `block` in place.
 *
 * Per the file title this is an 8x8 fast discrete cosine transform.  The
 * buffer is accessed as 32 words through a `long *` (tab[0] .. tab[31]);
 * every word is read before any word is written back.
 *
 * NOTE(review): the indexing assumes one `long` covers exactly two `short`
 * elements of `block` -- confirm for the target ABI.
 *
 * Pass 1: eight horiz_dct invocations, each consuming four consecutive
 * words.  They are issued in pairs (results in s0..s7 and t0..t7) and each
 * pair is merged into eight r-variables by packltor or packrtol,
 * alternating between the two.
 *
 * Pass 2: eight vertical_dct invocations, again in pairs.  Each takes one
 * r-variable from each of the four pass-1 groups; vertical_flush then
 * stores the pair's results to tab[k], tab[k + 4], ..., tab[k + 28].
 *
 * block: buffer that is both input and output.
 */
void fdct(short * restrict block)
{
/* Compiler-specific pragma; presumably for the TriMedia toolchain -- its exact effect is not visible in this file. */
#pragma TCS_no_caller_save
/* Scratch variables assigned inside the horiz_dct / vertical_dct macro bodies. */
int tmp0, tmp1, tmp2, tmp3, tmp101, tmp132, tmp176, tmp145, tmp201, tmp232;
/* Outputs of the current pair of DCT invocations; read implicitly by packltor, packrtol and vertical_flush. */
int s0, s1, s2, s3, s4, s5, s6, s7;
int t0, t1, t2, t3, t4, t5, t6, t7;
/* Packed pass-1 results: r0x/r1x from words 0..7, r2x/r3x from 8..15, r4x/r5x from 16..23, r6x/r7x from 24..31. */
int r00, r02, r04, r06, r10, r12, r14, r16;
int r20, r22, r24, r26, r30, r32, r34, r36;
int r40, r42, r44, r46, r50, r52, r54, r56;
int r60, r62, r64, r66, r70, r72, r74, r76;
/* Word-wise view of the caller's buffer. */
long *restrict tab = (long* )block;
/* Pass 1, words 0..7: s-first packing. */
horiz_dct(&tab[0], s0, s1, s2, s3, s4, s5, s6, s7);
horiz_dct(&tab[4], t0, t1, t2, t3, t4, t5, t6, t7);
packltor(r00, r02, r04, r06, r10, r12, r14, r16);
/* Pass 1, words 8..15: t-first packing. */
horiz_dct(&tab[8], s0, s1, s2, s3, s4, s5, s6, s7);
horiz_dct(&tab[12], t0, t1, t2, t3, t4, t5, t6, t7);
packrtol(r20, r22, r24, r26, r30, r32, r34, r36);
/* Pass 1, words 16..23: s-first packing. */
horiz_dct(&tab[16], s0, s1, s2, s3, s4, s5, s6, s7);
horiz_dct(&tab[20], t0, t1, t2, t3, t4, t5, t6, t7);
packltor(r40, r42, r44, r46, r50, r52, r54, r56);
/* Pass 1, words 24..31: t-first packing. */
horiz_dct(&tab[24], s0, s1, s2, s3, s4, s5, s6, s7);
horiz_dct(&tab[28], t0, t1, t2, t3, t4, t5, t6, t7);
packrtol(r60, r62, r64, r66, r70, r72, r74, r76);
/* Pass 2: pass-1 outputs o0 and o1 of every group; stored at tab[0], tab[4], ..., tab[28]. */
vertical_dct(r00, r20, r40, r60, s0, s1, s2, s3, s4, s5, s6, s7);
vertical_dct(r02, r22, r42, r62, t0, t1, t2, t3, t4, t5, t6, t7);
vertical_flush(&tab[0]);
/* Pass 2: pass-1 outputs o2 and o3; stored at tab[1], tab[5], ..., tab[29]. */
vertical_dct(r04, r24, r44, r64, s0, s1, s2, s3, s4, s5, s6, s7);
vertical_dct(r06, r26, r46, r66, t0, t1, t2, t3, t4, t5, t6, t7);
vertical_flush(&tab[1]);
/* Pass 2: pass-1 outputs o4 and o5; stored at tab[2], tab[6], ..., tab[30]. */
vertical_dct(r10, r30, r50, r70, s0, s1, s2, s3, s4, s5, s6, s7);
vertical_dct(r12, r32, r52, r72, t0, t1, t2, t3, t4, t5, t6, t7);
vertical_flush(&tab[2]);
/* Pass 2: pass-1 outputs o6 and o7; stored at tab[3], tab[7], ..., tab[31]. */
vertical_dct(r14, r34, r54, r74, s0, s1, s2, s3, s4, s5, s6, s7);
vertical_dct(r16, r36, r56, r76, t0, t1, t2, t3, t4, t5, t6, t7);
vertical_flush(&tab[3]);
}