From 38158dc5dd8e7c62ad2decfec395e3ec2c7e280b Mon Sep 17 00:00:00 2001 From: Brad Midgley Date: Mon, 28 Jan 2008 17:26:22 +0000 Subject: remove 16x16 mult optimization--gcc actually generates more costly code --- sbc/sbc.c | 186 +++++++++++++++++++++++++++---------------------------- sbc/sbc_math.h | 10 ++- sbc/sbc_tables.h | 8 +-- 3 files changed, 101 insertions(+), 103 deletions(-) (limited to 'sbc') diff --git a/sbc/sbc.c b/sbc/sbc.c index c9ea5b5c..97614fb9 100644 --- a/sbc/sbc.c +++ b/sbc/sbc.c @@ -719,47 +719,47 @@ static inline void _sbc_analyze_four(const int16_t *in, int32_t *out) MULA(res, _sbc_proto_4[1], in[16] - in[24]); t[0] = SCALE4_STAGE1(res); /* Q8 */ - MUL32(res, _sbc_proto_4[2], in[1]); - MULA32(res, _sbc_proto_4[3], in[9]); - MULA32(res, _sbc_proto_4[4], in[17]); - MULA32(res, _sbc_proto_4[5], in[25]); - MULA32(res, _sbc_proto_4[6], in[33]); + MUL(res, _sbc_proto_4[2], in[1]); + MULA(res, _sbc_proto_4[3], in[9]); + MULA(res, _sbc_proto_4[4], in[17]); + MULA(res, _sbc_proto_4[5], in[25]); + MULA(res, _sbc_proto_4[6], in[33]); t[1] = SCALE4_STAGE1(res); - MUL32(res, _sbc_proto_4[7], in[2]); - MULA32(res, _sbc_proto_4[8], in[10]); - MULA32(res, _sbc_proto_4[9], in[18]); - MULA32(res, _sbc_proto_4[10], in[26]); - MULA32(res, _sbc_proto_4[11], in[34]); + MUL(res, _sbc_proto_4[7], in[2]); + MULA(res, _sbc_proto_4[8], in[10]); + MULA(res, _sbc_proto_4[9], in[18]); + MULA(res, _sbc_proto_4[10], in[26]); + MULA(res, _sbc_proto_4[11], in[34]); t[2] = SCALE4_STAGE1(res); - MUL32(res, _sbc_proto_4[12], in[3]); - MULA32(res, _sbc_proto_4[13], in[11]); - MULA32(res, _sbc_proto_4[14], in[19]); - MULA32(res, _sbc_proto_4[15], in[27]); - MULA32(res, _sbc_proto_4[16], in[35]); + MUL(res, _sbc_proto_4[12], in[3]); + MULA(res, _sbc_proto_4[13], in[11]); + MULA(res, _sbc_proto_4[14], in[19]); + MULA(res, _sbc_proto_4[15], in[27]); + MULA(res, _sbc_proto_4[16], in[35]); t[3] = SCALE4_STAGE1(res); MUL(res, _sbc_proto_4[17], in[4] + in[36]); MULA(res, _sbc_proto_4[18], in[12] + in[28]); - MULA32(res, _sbc_proto_4[19], in[20]); + MULA(res, _sbc_proto_4[19], in[20]); t[4] = SCALE4_STAGE1(res); - MUL32(res, _sbc_proto_4[16], in[5]); - MULA32(res, _sbc_proto_4[15], in[13]); - MULA32(res, _sbc_proto_4[14], in[21]); - MULA32(res, _sbc_proto_4[13], in[29]); - MULA32(res, _sbc_proto_4[12], in[37]); + MUL(res, _sbc_proto_4[16], in[5]); + MULA(res, _sbc_proto_4[15], in[13]); + MULA(res, _sbc_proto_4[14], in[21]); + MULA(res, _sbc_proto_4[13], in[29]); + MULA(res, _sbc_proto_4[12], in[37]); t[5] = SCALE4_STAGE1(res); /* don't compute t[6]... this term always multiplies * with cos(pi/2) = 0 */ - MUL32(res, _sbc_proto_4[6], in[7]); - MULA32(res, _sbc_proto_4[5], in[15]); - MULA32(res, _sbc_proto_4[4], in[23]); - MULA32(res, _sbc_proto_4[3], in[31]); - MULA32(res, _sbc_proto_4[2], in[39]); + MUL(res, _sbc_proto_4[6], in[7]); + MULA(res, _sbc_proto_4[5], in[15]); + MULA(res, _sbc_proto_4[4], in[23]); + MULA(res, _sbc_proto_4[3], in[31]); + MULA(res, _sbc_proto_4[2], in[39]); t[7] = SCALE4_STAGE1(res); MUL(s[0], _anamatrix4[0], t[0] + t[4]); @@ -800,89 +800,89 @@ static inline void _sbc_analyze_eight(const int16_t *in, int32_t *out) sbc_fixed_t t[8]; sbc_extended_t s[8]; - MUL32(res, _sbc_proto_8[0], (in[16] - in[64])); /* Q18 = Q18 * Q0 */ - MULA32(res, _sbc_proto_8[1], (in[32] - in[48])); - MULA32(res, _sbc_proto_8[2], in[4]); - MULA32(res, _sbc_proto_8[3], in[20]); - MULA32(res, _sbc_proto_8[4], in[36]); - MULA32(res, _sbc_proto_8[5], in[52]); + MUL(res, _sbc_proto_8[0], (in[16] - in[64])); /* Q18 = Q18 * Q0 */ + MULA(res, _sbc_proto_8[1], (in[32] - in[48])); + MULA(res, _sbc_proto_8[2], in[4]); + MULA(res, _sbc_proto_8[3], in[20]); + MULA(res, _sbc_proto_8[4], in[36]); + MULA(res, _sbc_proto_8[5], in[52]); t[0] = SCALE8_STAGE1(res); /* Q10 */ - MUL32(res, _sbc_proto_8[6], in[2]); - MULA32(res, _sbc_proto_8[7], in[18]); - MULA32(res, _sbc_proto_8[8], in[34]); - MULA32(res, _sbc_proto_8[9], in[50]); - MULA32(res, _sbc_proto_8[10], in[66]); + MUL(res, _sbc_proto_8[6], in[2]); + MULA(res, _sbc_proto_8[7], in[18]); + MULA(res, _sbc_proto_8[8], in[34]); + MULA(res, _sbc_proto_8[9], in[50]); + MULA(res, _sbc_proto_8[10], in[66]); t[1] = SCALE8_STAGE1(res); - MUL32(res, _sbc_proto_8[11], in[1]); - MULA32(res, _sbc_proto_8[12], in[17]); - MULA32(res, _sbc_proto_8[13], in[33]); - MULA32(res, _sbc_proto_8[14], in[49]); - MULA32(res, _sbc_proto_8[15], in[65]); - MULA32(res, _sbc_proto_8[16], in[3]); - MULA32(res, _sbc_proto_8[17], in[19]); - MULA32(res, _sbc_proto_8[18], in[35]); - MULA32(res, _sbc_proto_8[19], in[51]); - MULA32(res, _sbc_proto_8[20], in[67]); + MUL(res, _sbc_proto_8[11], in[1]); + MULA(res, _sbc_proto_8[12], in[17]); + MULA(res, _sbc_proto_8[13], in[33]); + MULA(res, _sbc_proto_8[14], in[49]); + MULA(res, _sbc_proto_8[15], in[65]); + MULA(res, _sbc_proto_8[16], in[3]); + MULA(res, _sbc_proto_8[17], in[19]); + MULA(res, _sbc_proto_8[18], in[35]); + MULA(res, _sbc_proto_8[19], in[51]); + MULA(res, _sbc_proto_8[20], in[67]); t[2] = SCALE8_STAGE1(res); - MUL32(res, _sbc_proto_8[21], in[5]); - MULA32(res, _sbc_proto_8[22], in[21]); - MULA32(res, _sbc_proto_8[23], in[37]); - MULA32(res, _sbc_proto_8[24], in[53]); - MULA32(res, _sbc_proto_8[25], in[69]); - MULA32(res, -_sbc_proto_8[15], in[15]); - MULA32(res, -_sbc_proto_8[14], in[31]); - MULA32(res, -_sbc_proto_8[13], in[47]); - MULA32(res, -_sbc_proto_8[12], in[63]); - MULA32(res, -_sbc_proto_8[11], in[79]); + MUL(res, _sbc_proto_8[21], in[5]); + MULA(res, _sbc_proto_8[22], in[21]); + MULA(res, _sbc_proto_8[23], in[37]); + MULA(res, _sbc_proto_8[24], in[53]); + MULA(res, _sbc_proto_8[25], in[69]); + MULA(res, -_sbc_proto_8[15], in[15]); + MULA(res, -_sbc_proto_8[14], in[31]); + MULA(res, -_sbc_proto_8[13], in[47]); + MULA(res, -_sbc_proto_8[12], in[63]); + MULA(res, -_sbc_proto_8[11], in[79]); t[3] = SCALE8_STAGE1(res); - MUL32(res, _sbc_proto_8[26], in[6]); - MULA32(res, _sbc_proto_8[27], in[22]); - MULA32(res, _sbc_proto_8[28], in[38]); - MULA32(res, _sbc_proto_8[29], in[54]); - MULA32(res, _sbc_proto_8[30], in[70]); - MULA32(res, -_sbc_proto_8[10], in[14]); - MULA32(res, -_sbc_proto_8[9], in[30]); - MULA32(res, -_sbc_proto_8[8], in[46]); - MULA32(res, -_sbc_proto_8[7], in[62]); - MULA32(res, -_sbc_proto_8[6], in[78]); + MUL(res, _sbc_proto_8[26], in[6]); + MULA(res, _sbc_proto_8[27], in[22]); + MULA(res, _sbc_proto_8[28], in[38]); + MULA(res, _sbc_proto_8[29], in[54]); + MULA(res, _sbc_proto_8[30], in[70]); + MULA(res, -_sbc_proto_8[10], in[14]); + MULA(res, -_sbc_proto_8[9], in[30]); + MULA(res, -_sbc_proto_8[8], in[46]); + MULA(res, -_sbc_proto_8[7], in[62]); + MULA(res, -_sbc_proto_8[6], in[78]); t[4] = SCALE8_STAGE1(res); - MUL32(res, _sbc_proto_8[31], in[7]); - MULA32(res, _sbc_proto_8[32], in[23]); - MULA32(res, _sbc_proto_8[33], in[39]); - MULA32(res, _sbc_proto_8[34], in[55]); - MULA32(res, _sbc_proto_8[35], in[71]); - MULA32(res, -_sbc_proto_8[20], in[13]); - MULA32(res, -_sbc_proto_8[19], in[29]); - MULA32(res, -_sbc_proto_8[18], in[45]); - MULA32(res, -_sbc_proto_8[17], in[61]); - MULA32(res, -_sbc_proto_8[16], in[77]); + MUL(res, _sbc_proto_8[31], in[7]); + MULA(res, _sbc_proto_8[32], in[23]); + MULA(res, _sbc_proto_8[33], in[39]); + MULA(res, _sbc_proto_8[34], in[55]); + MULA(res, _sbc_proto_8[35], in[71]); + MULA(res, -_sbc_proto_8[20], in[13]); + MULA(res, -_sbc_proto_8[19], in[29]); + MULA(res, -_sbc_proto_8[18], in[45]); + MULA(res, -_sbc_proto_8[17], in[61]); + MULA(res, -_sbc_proto_8[16], in[77]); t[5] = SCALE8_STAGE1(res); MUL(res, _sbc_proto_8[36], in[8] + in[72]); MULA(res, _sbc_proto_8[37], in[24] + in[56]); - MULA32(res, _sbc_proto_8[38], in[40]); - MULA32(res, -_sbc_proto_8[39], in[12]); - MULA32(res, -_sbc_proto_8[5], in[28]); - MULA32(res, -_sbc_proto_8[4], in[44]); - MULA32(res, -_sbc_proto_8[3], in[60]); - MULA32(res, -_sbc_proto_8[2], in[76]); + MULA(res, _sbc_proto_8[38], in[40]); + MULA(res, -_sbc_proto_8[39], in[12]); + MULA(res, -_sbc_proto_8[5], in[28]); + MULA(res, -_sbc_proto_8[4], in[44]); + MULA(res, -_sbc_proto_8[3], in[60]); + MULA(res, -_sbc_proto_8[2], in[76]); t[6] = SCALE8_STAGE1(res); - MUL32(res, _sbc_proto_8[35], in[9]); - MULA32(res, _sbc_proto_8[34], in[25]); - MULA32(res, _sbc_proto_8[33], in[41]); - MULA32(res, _sbc_proto_8[32], in[57]); - MULA32(res, _sbc_proto_8[31], in[73]); - MULA32(res, -_sbc_proto_8[25], in[11]); - MULA32(res, -_sbc_proto_8[24], in[27]); - MULA32(res, -_sbc_proto_8[23], in[43]); - MULA32(res, -_sbc_proto_8[22], in[59]); - MULA32(res, -_sbc_proto_8[21], in[75]); + MUL(res, _sbc_proto_8[35], in[9]); + MULA(res, _sbc_proto_8[34], in[25]); + MULA(res, _sbc_proto_8[33], in[41]); + MULA(res, _sbc_proto_8[32], in[57]); + MULA(res, _sbc_proto_8[31], in[73]); + MULA(res, -_sbc_proto_8[25], in[11]); + MULA(res, -_sbc_proto_8[24], in[27]); + MULA(res, -_sbc_proto_8[23], in[43]); + MULA(res, -_sbc_proto_8[22], in[59]); + MULA(res, -_sbc_proto_8[21], in[75]); t[7] = SCALE8_STAGE1(res); MUL(s[0], _anamatrix8[0], t[0]); /* = Q14 * Q10 */ diff --git a/sbc/sbc_math.h b/sbc/sbc_math.h index c8c72c75..625d4dd0 100644 --- a/sbc/sbc_math.h +++ b/sbc/sbc_math.h @@ -31,20 +31,20 @@ #define ASR_64(val, bits) ((-2 >> 1 == -1) ? \ ((long long)(val)) >> (bits) : ((long long) (val)) / (1 << (bits))) -#define SCALE_PROTO4_TBL 16 +#define SCALE_PROTO4_TBL 15 #define SCALE_ANA4_TBL 16 -#define SCALE_PROTO8_TBL 16 +#define SCALE_PROTO8_TBL 15 #define SCALE_ANA8_TBL 16 #define SCALE_SPROTO4_TBL 16 #define SCALE_SPROTO8_TBL 16 #define SCALE_NPROTO4_TBL 10 #define SCALE_NPROTO8_TBL 12 #define SCALE_SAMPLES 14 -#define SCALE4_STAGE1_BITS 9 +#define SCALE4_STAGE1_BITS 10 #define SCALE4_STAGE2_BITS 21 #define SCALE4_STAGED1_BITS 18 #define SCALE4_STAGED2_BITS 23 -#define SCALE8_STAGE1_BITS 7 +#define SCALE8_STAGE1_BITS 8 #define SCALE8_STAGE2_BITS 24 #define SCALE8_STAGED1_BITS 18 #define SCALE8_STAGED2_BITS 23 @@ -64,8 +64,6 @@ typedef long long sbc_extended_t; #define SBC_FIXED_0(val) { val = 0; } #define ADD(dst, src) { dst += src; } #define SUB(dst, src) { dst -= src; } -#define MUL32(dst, a, b) { dst = (sbc_fixed_t) (a) * (b); } -#define MULA32(dst, a, b) { dst += (sbc_fixed_t) (a) * (b); } #define MUL(dst, a, b) { dst = (sbc_extended_t) (a) * (b); } #define MULA(dst, a, b) { dst += (sbc_extended_t) (a) * (b); } #define DIV2(dst, src) { dst = ASR(src, 1); } diff --git a/sbc/sbc_tables.h b/sbc/sbc_tables.h index 0da2a2d3..5e00caca 100644 --- a/sbc/sbc_tables.h +++ b/sbc/sbc_tables.h @@ -48,7 +48,7 @@ static const int sbc_offset8[4][8] = { #define SN4(val) ASR(val, SCALE_NPROTO4_TBL) #define SN8(val) ASR(val, SCALE_NPROTO8_TBL) -static const int16_t _sbc_proto_4[20] = { +static const int32_t _sbc_proto_4[20] = { SP4(0x02cb3e8c), SP4(0x22b63dc0), SP4(0x002329cc), SP4(0x053b7548), SP4(0x31eab940), SP4(0xec1f5e60), SP4(0xff3773a8), SP4(0x0061c5a7), SP4(0x07646680), SP4(0x3f239480), SP4(0xf89f23a8), SP4(0x007a4737), @@ -56,11 +56,11 @@ static const int16_t _sbc_proto_4[20] = { SP4(0x00ff11ca), SP4(0x00fb7991), SP4(0x069fdc58), SP4(0x4b584000) }; -static const int16_t _anamatrix4[4] = { +static const int32_t _anamatrix4[4] = { SA4(0x2d413cc0), SA4(0x3b20d780), SA4(0x40000000), SA4(0x187de2a0) }; -static const int16_t _sbc_proto_8[40] = { +static const int32_t _sbc_proto_8[40] = { SP8(0x02e5cd20), SP8(0x22d0c200), SP8(0x006bfe27), SP8(0x07808930), SP8(0x3f1c8800), SP8(0xf8810d70), SP8(0x002cfdc6), SP8(0x055acf28), SP8(0x31f566c0), SP8(0xebfe57e0), SP8(0xff27c437), SP8(0x001485cc), @@ -115,7 +115,7 @@ static const int32_t sbc_proto_8_80m1[] = { SS8(0x0d9daee0), SS8(0xeac182c0), SS8(0xfdf1c8d4), SS8(0xfff5bd1a) }; -static const int16_t _anamatrix8[8] = { +static const int32_t _anamatrix8[8] = { SA8(0x3b20d780), SA8(0x187de2a0), SA8(0x3ec52f80), SA8(0x3536cc40), SA8(0x238e7680), SA8(0x0c7c5c20), SA8(0x2d413cc0), SA8(0x40000000) }; -- cgit