/* 8x8 fast inverse discrete cosine transform (IDCT) for the Philips TriMedia 1500 series SoC. */
/*
 * Packed 32-bit constants. Each value holds two 16-bit fields; read as
 * signed 16-bit numbers the fields equal round(32768 * c) for:
 *   0x7d8a / 0x8276 : +cos(pi/16)  / -cos(pi/16)
 *   0x6a6e          : +cos(3pi/16)
 *   0x471d / 0xb8e3 : +sin(3pi/16) / -sin(3pi/16)
 *   0x18f9 / 0xe707 : +sin(pi/16)  / -sin(pi/16)
 *   0x7642 / 0x89be : +cos(pi/8)   / -cos(pi/8)
 *   0x30fc          : +sin(pi/8)
 *   0x5a82 / 0xa57e : +cos(pi/4)   / -cos(pi/4)
 */

/* MASK1..MASK4: second operand of IFIR16 for the z0/z5 (and r0/r2) terms. */
#define MASK1 0x89be30fc
#define MASK2 0x30fc7642
#define MASK3 0x5a825a82
#define MASK4 0xa57e5a82
/* D0..D7: second operand of IFIR16 for the z2/z3 (and r1/r3) terms. */
#define D0 0x6a6e18f9
#define D1 0x8276b8e3
#define D2 0x18f9471d
#define D3 0x6a6e8276
#define D4 0x82766a6e
#define D5 0xb8e3e707
#define D6 0x471d7d8a
#define D7 0x18f96a6e
/*
 * C0..C11: not referenced by the code visible in this part of the file.
 * C8, C9, C10 and C11 have the same values as MASK3, MASK4, MASK2 and MASK1;
 * C0, C2 and C4 have the same values as D3, D1 and D7.
 */
#define C0 0x6a6e8276
#define C1 0xb8e318f9
#define C2 0x8276b8e3
#define C3 0xe7076a6e
#define C4 0x18f96a6e
#define C5 0x8276471d
#define C6 0x471d18f9
#define C7 0x6a6e7d8a
#define C8 0x5a825a82
#define C9 0xa57e5a82
#define C10 0x30fc7642
#define C11 0x89be30fc
/*
 * Build-time scaling configuration.
 *
 * Every macro that expands to an arithmetic expression is fully
 * parenthesised so it keeps its value when embedded in a larger expression
 * (e.g. DWNSCL * 2 or PASS1_BITS << 1).
 */
#define SCALED_COEFFS 1
#define EIGHT_BIT_SAMPLES 0
#define PASS1_BITS (1 + (EIGHT_BIT_SAMPLES))
#define CONST_BITS2 14
/* Constant added to tmp20/tmp21 and temp20/temp21 inside horiz_idct. */
#define TMP_20_21_H_BIAS 0x8000
/*
 * NOTE(review): this tests LITTLE_ENDIAN while the PACK16_* selection tests
 * __LITTLE_ENDIAN__. In this file H_ROUNDING is only passed as the `comp`
 * argument of horiz_idct, which the macro body never uses - confirm which
 * symbol the toolchain actually defines before relying on this value.
 */
#define H_ROUNDING (32 << (16*!LITTLE_ENDIAN))
/* Left-shift applied to every input sample before packing in horiz_idct. */
#define UPSCL 3
/* Right-shift passed to dualasr in combinePred; also used to build `rd`. */
#define DWNSCL (CONST_BITS2 + PASS1_BITS + SCALED_COEFFS + 3 - 16)
/*
 * Endian-aware front ends for PACK16MSB / PACK16LSB.
 * When __LITTLE_ENDIAN__ is defined the two operands are exchanged before
 * being handed to the underlying operation; otherwise they are forwarded
 * in the order given.
 */
#ifndef __LITTLE_ENDIAN__
#define PACK16_MSB(first, second) PACK16MSB(first, second)
#define PACK16_LSB(first, second) PACK16LSB(first, second)
#else
#define PACK16_MSB(first, second) PACK16MSB(second, first)
#define PACK16_LSB(first, second) PACK16LSB(second, first)
#endif
/*
 * Prototypes for two operations declared with the toolchain's `custom_op`
 * keyword; both take a long and an unsigned long and return a long.
 * NOTE(review): presumably the TriMedia dual 16-bit arithmetic-shift-right
 * and dual 16-bit signed clip operations - confirm against the TriMedia
 * custom-operation reference.
 */
custom_op long dualasr(long a, unsigned long b);
custom_op long dualiclipi(long a, unsigned long b);
/* Lower-case spelling of PACK16_MSB, used by combinePred. */
#define pack16_msb(a, b) PACK16_MSB(a, b)
/*
 * combinePred - pack, scale down and clip four results into two outputs.
 *
 *   dct0, dct1   values packed with pack16_msb() to produce pred1
 *   dct2, dct3   values packed with pack16_msb() to produce pred2
 *   pred1, pred2 lvalues that receive the results
 *
 * Each packed word is passed through dualasr(..., DWNSCL) and then
 * dualiclipi(..., clip). `clip` is NOT a parameter: it is taken from the
 * scope in which the macro is expanded.
 *
 * The macro expands to two complete statements, each ending in ';', so
 * callers may invoke it without a trailing semicolon.
 *
 * The final line must not end in a backslash; otherwise the next source
 * line (another #define) would be spliced into this macro's replacement
 * list.
 */
#define combinePred(dct0, dct1, dct2, dct3, pred1, pred2) \
    pred1 = dualiclipi(dualasr(pack16_msb(dct0, dct1), DWNSCL), clip); \
    pred2 = dualiclipi(dualasr(pack16_msb(dct2, dct3), DWNSCL), clip);
/*
 * horiz_idct - first IDCT pass over two groups of eight input samples.
 *
 *   data     array of input samples (indexed, never written)
 *   offset   base index; the macro reads data[offset + 8*k] and
 *            data[offset + 4 + 8*k] for k = 0..7
 *   r0..r7   lvalues receiving the eight packed results; each is
 *            PACK16_MSB(result from the first group, result from the
 *            second group)
 *   comp     accepted for interface compatibility; not used by the body
 *
 * Every sample is shifted left by UPSCL and packed pairwise with
 * PACK16_LSB before being fed to IFIR16 together with the MASKn / Dn
 * constants. TMP_20_21_H_BIAS is added to the tmp20/tmp21 and
 * temp20/temp21 terms.
 *
 * The macro assigns to z0, z2, z3, z5, zz0, zz2, zz3, zz5, tmp*, and temp*
 * which must be declared in the expanding scope.
 *
 * `data` and `offset` are parenthesised so expression arguments index
 * correctly, and the final line carries no trailing backslash so the
 * following #define is not absorbed into this macro.
 */
#define horiz_idct(data, offset, r0, r1, r2, r3, r4, r5, r6, r7, comp) \
    \
    /* first group: data[offset + 8*k] */ \
    z0 = PACK16_LSB((data)[(offset) +  0] << UPSCL, (data)[(offset) + 32] << UPSCL); \
    z2 = PACK16_LSB((data)[(offset) +  8] << UPSCL, (data)[(offset) + 40] << UPSCL); \
    z5 = PACK16_LSB((data)[(offset) + 16] << UPSCL, (data)[(offset) + 48] << UPSCL); \
    z3 = PACK16_LSB((data)[(offset) + 24] << UPSCL, (data)[(offset) + 56] << UPSCL); \
    /* second group: data[offset + 4 + 8*k] */ \
    zz0 = PACK16_LSB((data)[(offset) +  4] << UPSCL, (data)[(offset) + 36] << UPSCL); \
    zz2 = PACK16_LSB((data)[(offset) + 12] << UPSCL, (data)[(offset) + 44] << UPSCL); \
    zz5 = PACK16_LSB((data)[(offset) + 20] << UPSCL, (data)[(offset) + 52] << UPSCL); \
    zz3 = PACK16_LSB((data)[(offset) + 28] << UPSCL, (data)[(offset) + 60] << UPSCL); \
    \
    tmp22 = IFIR16(z5, MASK1); \
    tmp23 = IFIR16(z5, MASK2); \
    tmp20 = IFIR16(z0, MASK3) + TMP_20_21_H_BIAS; \
    tmp21 = IFIR16(z0, MASK4) + TMP_20_21_H_BIAS; \
    \
    tmp10 = tmp20 + tmp23; \
    tmp13 = tmp20 - tmp23; \
    tmp11 = tmp21 + tmp22; \
    tmp12 = tmp21 - tmp22; \
    \
    tmp0 = IFIR16(z2, D0) + IFIR16(z3, D1); \
    tmp1 = IFIR16(z2, D2) + IFIR16(z3, D3); \
    tmp2 = IFIR16(z2, D4) + IFIR16(z3, D5); \
    tmp3 = IFIR16(z2, D6) + IFIR16(z3, D7); \
    \
    temp22 = IFIR16(zz5, MASK1); \
    temp23 = IFIR16(zz5, MASK2); \
    temp20 = IFIR16(zz0, MASK3) + TMP_20_21_H_BIAS; \
    temp21 = IFIR16(zz0, MASK4) + TMP_20_21_H_BIAS; \
    \
    temp10 = temp20 + temp23; \
    temp13 = temp20 - temp23; \
    temp11 = temp21 + temp22; \
    temp12 = temp21 - temp22; \
    \
    temp0 = IFIR16(zz2, D0) + IFIR16(zz3, D1); \
    temp1 = IFIR16(zz2, D2) + IFIR16(zz3, D3); \
    temp2 = IFIR16(zz2, D4) + IFIR16(zz3, D5); \
    temp3 = IFIR16(zz2, D6) + IFIR16(zz3, D7); \
    \
    /* combine both groups: sums first, then the mirrored differences */ \
    r0 = PACK16_MSB(tmp10 + tmp3, temp10 + temp3); \
    r1 = PACK16_MSB(tmp11 + tmp2, temp11 + temp2); \
    r2 = PACK16_MSB(tmp12 + tmp1, temp12 + temp1); \
    r3 = PACK16_MSB(tmp13 + tmp0, temp13 + temp0); \
    r4 = PACK16_MSB(tmp13 - tmp0, temp13 - temp0); \
    r5 = PACK16_MSB(tmp12 - tmp1, temp12 - temp1); \
    r6 = PACK16_MSB(tmp11 - tmp2, temp11 - temp2); \
    r7 = PACK16_MSB(tmp10 - tmp3, temp10 - temp3);
/*
 * vertical_idct - second IDCT pass on four packed words.
 *
 *   in0, in2      words combined with MASK3/MASK4 and MASK1/MASK2
 *   in1, in3      words combined with the D0..D7 constants
 *   dest1..dest4  lvalues receiving the four outputs of combinePred
 *
 * The butterfly sums (plus the rounding term `rd`) are handed to
 * combinePred in the order 10+3, 11+2, 12+1, 13+0 and then the mirrored
 * differences 13-0, 12-1, 11-2, 10-3.
 *
 * Uses tmp0..tmp3, tmp10..tmp13, tmp20..tmp23 and `rd` from the expanding
 * scope. The expansion already ends in ';'.
 */
#define vertical_idct(in0, in1, in2, in3, dest1, dest2, dest3, dest4) \
    \
    /* terms built from in1 / in3 */ \
    tmp0 = IFIR16(in1, D0) + IFIR16(in3, D1); \
    tmp1 = IFIR16(in1, D2) + IFIR16(in3, D3); \
    tmp2 = IFIR16(in1, D4) + IFIR16(in3, D5); \
    tmp3 = IFIR16(in1, D6) + IFIR16(in3, D7); \
    \
    /* terms built from in0 / in2 */ \
    tmp20 = IFIR16(in0, MASK3); \
    tmp21 = IFIR16(in0, MASK4); \
    tmp22 = IFIR16(in2, MASK1); \
    tmp23 = IFIR16(in2, MASK2); \
    \
    tmp10 = tmp20 + tmp23; \
    tmp11 = tmp21 + tmp22; \
    tmp12 = tmp21 - tmp22; \
    tmp13 = tmp20 - tmp23; \
    \
    combinePred(tmp10 + tmp3 + rd, tmp11 + tmp2 + rd, \
                tmp12 + tmp1 + rd, tmp13 + tmp0 + rd, dest1, dest2) \
    combinePred(tmp13 - tmp0 + rd, tmp12 - tmp1 + rd, \
                tmp11 - tmp2 + rd, tmp10 - tmp3 + rd, dest3, dest4) ;
/*
 * idct - in-place inverse DCT of one block of 64 coefficients.
 *
 * datain: buffer of 64 short values. All 64 are read by the four
 *         horiz_idct expansions (indices offset + 8*k and offset + 4 + 8*k
 *         for offset = 0..3), and the same buffer is then overwritten with
 *         32 long values by the eight vertical_idct expansions.
 *
 * Every input is loaded before the first output is stored, which is what
 * makes the in-place operation possible.
 *
 * `dataout` is deliberately NOT restrict-qualified: it aliases `datain`,
 * and assigning a value based on one restrict pointer to another restrict
 * pointer declared in the same block is undefined behaviour (C99 6.7.3.1).
 *
 * NOTE(review): storing 32 longs over a buffer read as 64 shorts presumes
 * sizeof(long) == 2 * sizeof(short) and suitable alignment of datain -
 * confirm for the target ABI.
 */
void idct(short * restrict datain)
{
    /* Packed first-pass results; rXY is output X of horiz_idct call Y-ish:
     * r0n..r3n come from r0..r3 and r4n..r7n from r4..r7 of each call. */
    int r00, r01, r02, r03;
    int r10, r11, r12, r13;
    int r20, r21, r22, r23;
    int r30, r31, r32, r33;
    int r40, r41, r42, r43;
    int r50, r51, r52, r53;
    int r60, r61, r62, r63;
    int r70, r71, r72, r73;

    /* Scratch variables assigned inside horiz_idct / vertical_idct. */
    int tmp0, tmp1, tmp2, tmp3;
    int tmp10, tmp11, tmp12, tmp13;
    int tmp20, tmp21, tmp22, tmp23;
    int temp0, temp1, temp2, temp3;
    int temp10, temp11, temp12, temp13;
    int temp20, temp21, temp22, temp23;
    int z0, z2, z3, z5;
    int zz0, zz2, zz3, zz5;

    long *dataout;                  /* output view of the input buffer */
    int rd;                         /* rounding term added in vertical_idct */
    const unsigned int clip = 2047; /* clip bound consumed by combinePred */

#pragma TCS_no_caller_save

    rd = 1 << (DWNSCL + 15);
    dataout = (long *)datain;

    /* First pass: each call consumes two groups of eight samples. */
    horiz_idct(datain, 0, r00, r01, r02, r03, r40, r41, r42, r43, H_ROUNDING);
    horiz_idct(datain, 1, r10, r11, r12, r13, r50, r51, r52, r53, 0);
    horiz_idct(datain, 2, r20, r21, r22, r23, r60, r61, r62, r63, 0);
    horiz_idct(datain, 3, r30, r31, r32, r33, r70, r71, r72, r73, 0);

    /* Second pass: each call produces four output words. */
    vertical_idct(r00, r10, r20, r30, dataout[0], dataout[1], dataout[2], dataout[3]);
    vertical_idct(r01, r11, r21, r31, dataout[4], dataout[5], dataout[6], dataout[7]);
    vertical_idct(r02, r12, r22, r32, dataout[8], dataout[9], dataout[10], dataout[11]);
    vertical_idct(r03, r13, r23, r33, dataout[12], dataout[13], dataout[14], dataout[15]);
    vertical_idct(r40, r50, r60, r70, dataout[16], dataout[17], dataout[18], dataout[19]);
    vertical_idct(r41, r51, r61, r71, dataout[20], dataout[21], dataout[22], dataout[23]);
    vertical_idct(r42, r52, r62, r72, dataout[24], dataout[25], dataout[26], dataout[27]);
    vertical_idct(r43, r53, r63, r73, dataout[28], dataout[29], dataout[30], dataout[31]);
}