From 7fbcefdd87d222c04023dcb1b3ff3cf5aeae8613 Mon Sep 17 00:00:00 2001
From: Brad Midgley <bmidgley@xmission.com>
Date: Thu, 4 Oct 2007 18:28:17 +0000
Subject: enable 4 subband lower half optimizations

---
 sbc/sbc.c | 55 +++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/sbc/sbc.c b/sbc/sbc.c
index dcff5382..a5cbf803 100644
--- a/sbc/sbc.c
+++ b/sbc/sbc.c
@@ -700,12 +700,7 @@ static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
 
 	sbc_extended_t res;
 	sbc_extended_t t[8];
-
-#if 0
-	/* temporary results */
-	sbc_extended_t s[2], p[6], d[4];
-#endif
-	out[0] = out[1] = out[2] = out[3] = 0;
+	sbc_extended_t s[4];
 
 	MUL(res, _sbc_proto_4[0], (in[8] - in[32])); // Q18
 	MULA(res, _sbc_proto_4[1], (in[16] - in[24]));
@@ -744,14 +739,15 @@ static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
 	MULA(res, _sbc_proto_4[13], in[29]);
 	MULA(res, _sbc_proto_4[12], in[37]);
 	t[5] = SCALE4_STAGE1(res);
-
+#if 0
+	/* don't compute... this term always multiplies with cos(pi/2) = 0 */
 	MUL(res, _sbc_proto_4[11], in[6]);
 	MULA(res, _sbc_proto_4[10], in[14]);
 	MULA(res, _sbc_proto_4[9], in[22]);
 	MULA(res, _sbc_proto_4[8], in[30]);
 	MULA(res, _sbc_proto_4[7], in[38]);
 	t[6] = SCALE4_STAGE1(res);
-
+#endif
 	MUL(res, _sbc_proto_4[6], in[7]);
 	MULA(res, _sbc_proto_4[5], in[15]);
 	MULA(res, _sbc_proto_4[4], in[23]);
@@ -759,7 +755,7 @@ static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
 	MULA(res, _sbc_proto_4[2], in[39]);
 	t[7] = SCALE4_STAGE1(res);
 
-#if 1
+#if 0
 	MUL(res, _anamatrix4[0], t[0]);
 	MULA(res, _anamatrix4[1], t[1]);
 	MULA(res, _anamatrix4[2], t[2]);
@@ -796,24 +792,24 @@ static inline void _sbc_analyze_four(const int32_t *in, int32_t *out)
 	MULA(res, -_anamatrix4[3], t[5]);
 	MULA(res, _anamatrix4[3], t[7]);
 	out[3] = SCALE4_STAGE2(res);
-#endif
-#if 0
-	s[0] = t[1] + t[3];
-	s[1] = t[5] - t[6];
-	MUL(p[0], _anamatrix4[0], t[0] + t[4]);
-	MUL(p[1], _anamatrix4[1], s[0]);
-	MUL(p[2], _anamatrix4[2], t[2]);
-	MUL(p[3], _anamatrix4[3], s[0]);
-	MUL(p[4], _anamatrix4[3], s[1]);
-	MUL(p[5], _anamatrix4[1], s[1]);
-	d[0] = p[0] + p[2];
-	d[1] = p[2] - p[0];
-	d[2] = p[1] + p[4];
-	d[3] = p[3] - p[5];
-	out[0] = SCALE4_STAGE2(d[0] + d[2]);
-	out[1] = SCALE4_STAGE2(d[1] + d[3]);
-	out[2] = SCALE4_STAGE2(d[1] - d[3]);
-	out[3] = SCALE4_STAGE2(d[0] - d[2]);
+#else
+	/* some of these multiplies could be factored more but something overflows */
+	/* eg replace the first two lines with MUL(s[0], _anamatrix4[0], t[0] + t[4]) */
+	MUL(s[0], _anamatrix4[0], t[0]);
+	MULA(s[0], _anamatrix4[0], t[4]);
+	MUL(s[1], _anamatrix4[2], t[2]);
+	MUL(s[2], _anamatrix4[1], t[1]);
+	MULA(s[2], _anamatrix4[1], t[3]);
+	MULA(s[2], _anamatrix4[3], t[5]);
+	MULA(s[2], -_anamatrix4[3], t[7]);
+	MUL(s[3], _anamatrix4[3], t[1]);
+	MULA(s[3], _anamatrix4[3], t[3]);
+	MULA(s[3], -_anamatrix4[1], t[5]);
+	MULA(s[3], _anamatrix4[1], t[7]);
+	out[0] = SCALE4_STAGE2( s[0] + s[1] + s[2]);
+	out[1] = SCALE4_STAGE2(-s[0] + s[1] + s[3]);
+	out[2] = SCALE4_STAGE2(-s[0] + s[1] - s[3]);
+	out[3] = SCALE4_STAGE2( s[0] + s[1] - s[2]);
 #endif
 }
 static inline void sbc_analyze_four(struct sbc_encoder_state *state,
@@ -832,12 +828,7 @@ static inline void _sbc_analyze_eight(const int32_t *in, int32_t *out)
 {
 	sbc_extended_t res;
 	sbc_extended_t t[8];
-
-#if 1
-	/* temporary results */
 	sbc_extended_t s[8];
-#endif
-	out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = 0;
 	
 	MUL(res,  _sbc_proto_8[0], (in[16] - in[64])); // Q18 = Q18 * Q0
 	MULA(res, _sbc_proto_8[1], (in[32] - in[48]));
-- 
cgit