7 files changed, 271 insertions, 39 deletions
diff --git a/configure.ac b/configure.ac
index bc6c1ce6..be79759c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -393,7 +393,7 @@ AC_FUNC_SELECT_ARGTYPES
 AC_CHECK_FUNCS([chmod chown clock_gettime getaddrinfo getgrgid_r getgrnam_r \
     getpwnam_r getpwuid_r gettimeofday getuid inet_ntop inet_pton mlock nanosleep \
     pipe posix_fadvise posix_madvise posix_memalign setpgid setsid shm_open \
-    sigaction sleep sysconf])
+    sigaction sleep sysconf pthread_setaffinity_np])
 AC_CHECK_FUNCS([mkfifo], [HAVE_MKFIFO=1], [HAVE_MKFIFO=0])
 
 AM_CONDITIONAL(HAVE_MKFIFO, test "x$HAVE_MKFIFO" = "x1")
diff --git a/src/.gitignore b/src/.gitignore
index 72c38cc6..66738d0a 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -58,3 +58,4 @@ thread-test
 utf8-test
 voltest
 start-pulseaudio-x11
+vector-test
diff --git a/src/Makefile.am b/src/Makefile.am
index d77f4dc1..24623d3f 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -232,6 +232,7 @@ TESTS = \
 		strlist-test \
 		close-test \
 		voltest \
+		vector-test \
 		memblockq-test \
 		channelmap-test \
 		thread-mainloop-test \
@@ -262,6 +263,7 @@ TESTS_BINARIES = \
 		strlist-test \
 		close-test \
 		voltest \
+		vector-test \
 		memblockq-test \
 		sync-playback \
 		interpol-test \
@@ -407,6 +409,11 @@ voltest_CFLAGS = $(AM_CFLAGS)
 voltest_LDADD = $(AM_LDADD) libpulse.la
 voltest_LDFLAGS = $(AM_LDFLAGS) $(BINLDFLAGS)
 
+vector_test_SOURCES = tests/vector-test.c
+vector_test_CFLAGS = $(AM_CFLAGS)
+vector_test_LDADD = $(AM_LDADD) libpulsecore-@PA_MAJORMINORMICRO@.la libpulsecommon-@PA_MAJORMINORMICRO@.la
+vector_test_LDFLAGS = $(AM_LDFLAGS) $(BINLDFLAGS)
+
 channelmap_test_SOURCES = tests/channelmap-test.c
 channelmap_test_CFLAGS = $(AM_CFLAGS)
 channelmap_test_LDADD = $(AM_LDADD) libpulse.la
@@ -526,7 +533,7 @@ libpulsecommon_@PA_MAJORMINORMICRO@_la_SOURCES = \
 		pulsecore/llist.h \
 		pulsecore/lock-autospawn.c pulsecore/lock-autospawn.h \
 		pulsecore/log.c pulsecore/log.h \
-		pulsecore/macro.h \
+		pulsecore/macro.h pulsecore/vector.h \
 		pulsecore/mcalign.c pulsecore/mcalign.h \
 		pulsecore/memblock.c pulsecore/memblock.h \
 		pulsecore/memblockq.c pulsecore/memblockq.h \
diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c
index cf7b4d58..905ba5df 100644
--- a/src/pulsecore/sample-util.c
+++ b/src/pulsecore/sample-util.c
@@ -213,13 +213,22 @@ size_t pa_mix(
 
                 for (i = 0; i < nstreams; i++) {
                     pa_mix_info *m = streams + i;
-                    int32_t v, cv = m->linear[channel].i;
+                    int32_t v, lo, hi, cv = m->linear[channel].i;
 
                     if (PA_UNLIKELY(cv <= 0))
                         continue;
 
+                    /* Multiplying the 32bit volume factor with the
+                     * 16bit sample might result in a 48bit value. We
+                     * want to do without 64 bit integers and hence do
+                     * the multiplication independently for the HI and
+                     * LO part of the volume. */
+
+                    hi = cv >> 16;
+                    lo = cv & 0xFFFF;
+
                     v = *((int16_t*) m->ptr);
-                    v = (v * cv) / 0x10000;
+                    v = ((v * lo) >> 16) + (v * hi);
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + sizeof(int16_t);
@@ -248,13 +257,16 @@ size_t pa_mix(
 
                 for (i = 0; i < nstreams; i++) {
                     pa_mix_info *m = streams + i;
-                    int32_t v, cv = m->linear[channel].i;
+                    int32_t v, lo, hi, cv = m->linear[channel].i;
 
                     if (PA_UNLIKELY(cv <= 0))
                         continue;
 
+                    hi = cv >> 16;
+                    lo = cv & 0xFFFF;
+
                     v = PA_INT16_SWAP(*((int16_t*) m->ptr));
-                    v = (v * cv) / 0x10000;
+                    v = ((v * lo) >> 16) + (v * hi);
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + sizeof(int16_t);
@@ -290,7 +302,7 @@ size_t pa_mix(
                         continue;
 
                     v = *((int32_t*) m->ptr);
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + sizeof(int32_t);
@@ -326,7 +338,7 @@ size_t pa_mix(
                         continue;
 
                     v = PA_INT32_SWAP(*((int32_t*) m->ptr));
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + sizeof(int32_t);
@@ -362,7 +374,7 @@ size_t pa_mix(
                         continue;
 
                     v = (int32_t) (PA_READ24NE(m->ptr) << 8);
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 3;
@@ -398,7 +410,7 @@ size_t pa_mix(
                         continue;
 
                     v = (int32_t) (PA_READ24RE(m->ptr) << 8);
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 3;
@@ -434,7 +446,7 @@ size_t pa_mix(
                         continue;
 
                     v = (int32_t) (*((uint32_t*)m->ptr) << 8);
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + sizeof(int32_t);
@@ -470,7 +482,7 @@ size_t pa_mix(
                         continue;
 
                     v = (int32_t) (PA_UINT32_SWAP(*((uint32_t*) m->ptr)) << 8);
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 3;
@@ -505,7 +517,7 @@ size_t pa_mix(
                         continue;
 
                     v = (int32_t) *((uint8_t*) m->ptr) - 0x80;
-                    v = (v * cv) / 0x10000;
+                    v = (v * cv) >> 16;
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 1;
@@ -534,13 +546,16 @@ size_t pa_mix(
 
                 for (i = 0; i < nstreams; i++) {
                     pa_mix_info *m = streams + i;
-                    int32_t v, cv = m->linear[channel].i;
+                    int32_t v, hi, lo, cv = m->linear[channel].i;
 
                     if (PA_UNLIKELY(cv <= 0))
                         continue;
 
+                    hi = cv >> 16;
+                    lo = cv & 0xFFFF;
+
                     v = (int32_t) st_ulaw2linear16(*((uint8_t*) m->ptr));
-                    v = (v * cv) / 0x10000;
+                    v = ((v * lo) >> 16) + (v * hi);
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 1;
@@ -569,13 +584,16 @@ size_t pa_mix(
 
                 for (i = 0; i < nstreams; i++) {
                     pa_mix_info *m = streams + i;
-                    int32_t v, cv = m->linear[channel].i;
+                    int32_t v, hi, lo, cv = m->linear[channel].i;
 
                     if (PA_UNLIKELY(cv <= 0))
                         continue;
 
+                    hi = cv >> 16;
+                    lo = cv & 0xFFFF;
+
                     v = (int32_t) st_alaw2linear16(*((uint8_t*) m->ptr));
-                    v = (v * cv) / 0x10000;
+                    v = ((v * lo) >> 16) + (v * hi);
                     sum += v;
 
                     m->ptr = (uint8_t*) m->ptr + 1;
@@ -710,16 +728,26 @@ void pa_volume_memchunk(
             e = (int16_t*) ptr + c->length/sizeof(int16_t);
 
             for (channel = 0, d = ptr; d < e; d++) {
-                int32_t t;
+                int32_t t, hi, lo;
+
+                /* Multiplying the 32bit volume factor with the 16bit
+                 * sample might result in a 48bit value. We want to
+                 * do without 64 bit integers and hence do the
+                 * multiplication independently for the HI and LO part
+                 * of the volume. */
+
+                hi = linear[channel] >> 16;
+                lo = linear[channel] & 0xFFFF;
 
                 t = (int32_t)(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = ((t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                 *d = (int16_t) t;
 
                 if (PA_UNLIKELY(++channel >= spec->channels))
                     channel = 0;
             }
+
             break;
         }
 
@@ -733,10 +761,13 @@ void pa_volume_memchunk(
             e = (int16_t*) ptr + c->length/sizeof(int16_t);
 
             for (channel = 0, d = ptr; d < e; d++) {
-                int32_t t;
+                int32_t t, hi, lo;
+
+                hi = linear[channel] >> 16;
+                lo = linear[channel] & 0xFFFF;
 
                 t = (int32_t) PA_INT16_SWAP(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = ((t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                 *d = PA_INT16_SWAP((int16_t) t);
 
@@ -760,7 +791,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t)(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 *d = (int32_t) t;
 
@@ -783,7 +814,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t) PA_INT32_SWAP(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 *d = PA_INT32_SWAP((int32_t) t);
 
@@ -806,7 +837,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t)((int32_t) (PA_READ24NE(d) << 8));
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 PA_WRITE24NE(d, ((uint32_t) (int32_t) t) >> 8);
 
@@ -829,7 +860,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t)((int32_t) (PA_READ24RE(d) << 8));
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 PA_WRITE24RE(d, ((uint32_t) (int32_t) t) >> 8);
 
@@ -852,7 +883,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t) ((int32_t) (*d << 8));
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 *d = ((uint32_t) ((int32_t) t)) >> 8;
 
@@ -875,7 +906,7 @@ void pa_volume_memchunk(
                 int64_t t;
 
                 t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*d) << 8));
-                t = (t * linear[channel]) / 0x10000;
+                t = (t * linear[channel]) >> 16;
                 t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
                 *d = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);
 
@@ -895,10 +926,13 @@ void pa_volume_memchunk(
             e = (uint8_t*) ptr + c->length;
 
             for (channel = 0, d = ptr; d < e; d++) {
-                int32_t t;
+                int32_t t, hi, lo;
+
+                hi = linear[channel] >> 16;
+                lo = linear[channel] & 0xFFFF;
 
                 t = (int32_t) *d - 0x80;
-                t = (t * linear[channel]) / 0x10000;
+                t = ((t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
                 *d = (uint8_t) (t + 0x80);
 
@@ -918,10 +952,13 @@ void pa_volume_memchunk(
             e = (uint8_t*) ptr + c->length;
 
             for (channel = 0, d = ptr; d < e; d++) {
-                int32_t t;
+                int32_t t, hi, lo;
+
+                hi = linear[channel] >> 16;
+                lo = linear[channel] & 0xFFFF;
 
                 t = (int32_t) st_ulaw2linear16(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = ((t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                 *d = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);
 
@@ -941,10 +978,13 @@ void pa_volume_memchunk(
             e = (uint8_t*) ptr + c->length;
 
             for (channel = 0, d = ptr; d < e; d++) {
-                int32_t t;
+                int32_t t, hi, lo;
+
+                hi = linear[channel] >> 16;
+                lo = linear[channel] & 0xFFFF;
 
                 t = (int32_t) st_alaw2linear16(*d);
-                t = (t * linear[channel]) / 0x10000;
+                t = ((t * lo) >> 16) + (t * hi);
                 t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
                 *d = (uint8_t) st_13linear2alaw((int16_t) t >> 3);
 
diff --git a/src/pulsecore/vector.h b/src/pulsecore/vector.h
new file mode 100644
index 00000000..076bd6c0
--- /dev/null
+++ b/src/pulsecore/vector.h
@@ -0,0 +1,97 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2004-2006 Lennart Poettering
+  Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#include <inttypes.h>
+
+/* Define HAVE_VECTOR only for targets that get PA_xxx_VECTOR_SIZE/MAKE below (no AltiVec yet) */
+#if defined(__SSE2__)
+#define HAVE_VECTOR
+
+
+/* This is supposed to be portable to different SIMD instruction
+ * sets. We define vector types for different base types: uint8_t,
+ * int16_t, int32_t, float. The vector type is a union. The fields .i,
+ * .u, .f are arrays for accessing the separate elements of a
+ * vector. .v is a gcc vector type of the right format. .m is the
+ * vector in the type the SIMD extension specific intrinsics API
+ * expects. PA_xxx_VECTOR_SIZE is the number of elements in a
+ * vector. PA_xxx_VECTOR_MAKE constructs a gcc vector value with
+ * the same value in all elements. */
+
+#ifdef __SSE2__
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#define PA_UINT8_VECTOR_SIZE 16
+#define PA_INT16_VECTOR_SIZE 8
+#define PA_INT32_VECTOR_SIZE 4
+#define PA_FLOAT_VECTOR_SIZE 4
+
+#define PA_UINT8_VECTOR_MAKE(x) (pa_v16qi) { x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x }
+#define PA_INT16_VECTOR_MAKE(x) (pa_v8hi) { x, x, x, x, x, x, x, x }
+#define PA_INT32_VECTOR_MAKE(x) (pa_v4si) { x, x, x, x }
+#define PA_FLOAT_VECTOR_MAKE(x) (pa_v4sf) { x, x, x, x }
+
+#endif /* __SSE2__ */
+
+/* uint8_t vector: .u = per-element access, .v = gcc vector type, .m = SSE2 intrinsics type */
+typedef uint8_t pa_v16qi __attribute__ ((vector_size (PA_UINT8_VECTOR_SIZE * sizeof(uint8_t))));
+typedef union pa_uint8_vector {
+    uint8_t u[PA_UINT8_VECTOR_SIZE];
+    pa_v16qi v;
+#ifdef __SSE2__
+    __m128i m;
+#endif
+} pa_uint8_vector_t;
+
+/* int16_t vector: .i = per-element access, .v = gcc vector type, .m = SSE2 intrinsics type */
+typedef int16_t pa_v8hi __attribute__ ((vector_size (PA_INT16_VECTOR_SIZE * sizeof(int16_t))));
+typedef union pa_int16_vector {
+    int16_t i[PA_INT16_VECTOR_SIZE];
+    pa_v8hi v;
+#ifdef __SSE2__
+    __m128i m;
+#endif
+} pa_int16_vector_t;
+
+/* int32_t vector: .i = per-element access, .v = gcc vector type, .m = SSE2 intrinsics type */
+typedef int32_t pa_v4si __attribute__ ((vector_size (PA_INT32_VECTOR_SIZE * sizeof(int32_t))));
+typedef union pa_int32_vector {
+    int32_t i[PA_INT32_VECTOR_SIZE];
+    pa_v4si v;
+#ifdef __SSE2__
+    __m128i m;
+#endif
+} pa_int32_vector_t;
+
+/* float vector: .f = per-element access, .v = gcc vector type, .m = SSE2 intrinsics type */
+typedef float pa_v4sf __attribute__ ((vector_size (PA_FLOAT_VECTOR_SIZE * sizeof(float))));
+typedef union pa_float_vector {
+    float f[PA_FLOAT_VECTOR_SIZE];
+    pa_v4sf v;
+#ifdef __SSE2__
+    __m128 m;
+#endif
+} pa_float_vector_t;
+
+#endif /* HAVE_VECTOR */
diff --git a/src/tests/rtstutter.c b/src/tests/rtstutter.c
index fc23d959..d8aff342 100644
--- a/src/tests/rtstutter.c
+++ b/src/tests/rtstutter.c
@@ -43,24 +43,28 @@ static int msec_lower, msec_upper;
 static void* work(void *p) PA_GCC_NORETURN;
 
 static void* work(void *p) {
+#ifdef HAVE_PTHREAD_SETAFFINITY_NP
     cpu_set_t mask;
+#endif
     struct sched_param param;
 
-    pa_log_notice("CPU%i: Created thread.", PA_PTR_TO_INT(p));
+    pa_log_notice("CPU%i: Created thread.", PA_PTR_TO_UINT(p));
 
     memset(&param, 0, sizeof(param));
     param.sched_priority = 12;
     pa_assert_se(pthread_setschedparam(pthread_self(), SCHED_FIFO, &param) == 0);
 
+#ifdef HAVE_PTHREAD_SETAFFINITY_NP
     CPU_ZERO(&mask);
-    CPU_SET((size_t) PA_PTR_TO_INT(p), &mask);
+    CPU_SET((size_t) PA_PTR_TO_UINT(p), &mask);
     pa_assert_se(pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == 0);
+#endif
 
     for (;;) {
         struct timespec now, end;
         uint64_t nsec;
 
-        pa_log_notice("CPU%i: Sleeping for 1s", PA_PTR_TO_INT(p));
+        pa_log_notice("CPU%i: Sleeping for 1s", PA_PTR_TO_UINT(p));
         sleep(1);
 
         pa_assert_se(clock_gettime(CLOCK_REALTIME, &end) == 0);
@@ -69,7 +73,7 @@ static void* work(void *p) {
             (uint64_t) ((((double) rand())*(double)(msec_upper-msec_lower)*PA_NSEC_PER_MSEC)/RAND_MAX) +
             (uint64_t) ((uint64_t) msec_lower*PA_NSEC_PER_MSEC);
 
-        pa_log_notice("CPU%i: Freezing for %ims", PA_PTR_TO_INT(p), (int) (nsec/PA_NSEC_PER_MSEC));
+        pa_log_notice("CPU%i: Freezing for %ims", PA_PTR_TO_UINT(p), (int) (nsec/PA_NSEC_PER_MSEC));
 
         end.tv_sec += (time_t) (nsec / PA_NSEC_PER_SEC);
         end.tv_nsec += (long int) (nsec % PA_NSEC_PER_SEC);
@@ -87,7 +91,7 @@ static void* work(void *p) {
 }
 
 int main(int argc, char*argv[]) {
-    int n;
+    unsigned n;
 
     srand((unsigned) time(NULL));
 
@@ -109,7 +113,7 @@ int main(int argc, char*argv[]) {
 
     for (n = 1; n < pa_ncpus(); n++) {
         pthread_t t;
-        pa_assert_se(pthread_create(&t, NULL, work, PA_INT_TO_PTR(n)) == 0);
+        pa_assert_se(pthread_create(&t, NULL, work, PA_UINT_TO_PTR(n)) == 0);
     }
 
     work(PA_INT_TO_PTR(0));
diff --git a/src/tests/vector-test.c b/src/tests/vector-test.c
new file mode 100644
index 00000000..f7344172
--- /dev/null
+++ b/src/tests/vector-test.c
@@ -0,0 +1,83 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2009 Lennart Poettering
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as
+  published by the Free Software Foundation; either version 2 of the
+  License, or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <pulsecore/vector.h>
+#include <pulsecore/log.h>
+
+int main(int argc, char *argv[]) {
+    /* Scales a vector of int16 samples by a 32bit volume with SSE2 and logs each step; no-op without SSE2. */
+#ifdef __SSE2__
+    pa_int16_vector_t input, zero;
+    pa_int32_vector_t unpacked1, unpacked2;
+    pa_int32_vector_t volume1, volume2, volume1_hi, volume1_lo, volume2_hi, volume2_lo, reduce, mask;
+    pa_int16_vector_t output;
+
+    unsigned u;
+    /* Constants: zero for unpacking, reduce = 0x10000 as divisor, a test volume, mask = low 16 bits */
+    zero.v = PA_INT16_VECTOR_MAKE(0);
+    reduce.v = PA_INT32_VECTOR_MAKE(0x10000);
+    volume1.v = volume2.v = PA_INT32_VECTOR_MAKE(0x10000*2+7);
+    mask.v = PA_INT32_VECTOR_MAKE(0xFFFF);
+    /* Split each 32bit volume into its low 16 bits (and with mask) and high 16 bits (logical shift right) */
+    volume1_lo.m = _mm_and_si128(volume1.m, mask.m);
+    volume2_lo.m = _mm_and_si128(volume2.m, mask.m);
+    volume1_hi.m = _mm_srli_epi32(volume1.m, 16);
+    volume2_hi.m = _mm_srli_epi32(volume2.m, 16);
+
+    input.v = PA_INT16_VECTOR_MAKE(32000);
+
+    for (u = 0; u < PA_INT16_VECTOR_SIZE; u++)
+        pa_log("input=%i\n", input.i[u]);
+    /* Interleave zero (low half) with the samples (high half) of each 32bit lane, i.e. sample << 16 */
+    unpacked1.m = _mm_unpackhi_epi16(zero.m, input.m);
+    unpacked2.m = _mm_unpacklo_epi16(zero.m, input.m);
+
+    for (u = 0; u < PA_INT32_VECTOR_SIZE; u++)
+        pa_log("unpacked1=%i\n", unpacked1.i[u]);
+    /* Divide by 0x10000 to bring the samples back down into the low 16 bits of each lane */
+    unpacked1.v /= reduce.v;
+    unpacked2.v /= reduce.v;
+
+    for (u = 0; u < PA_INT32_VECTOR_SIZE; u++)
+        pa_log("unpacked1=%i\n", unpacked1.i[u]);
+
+    for (u = 0; u < PA_INT32_VECTOR_SIZE; u++)
+        pa_log("volume1=%i\n", volume1.i[u]);
+    /* Apply the volume per lane: (sample * lo) / 0x10000 + sample * hi */
+    unpacked1.v = (unpacked1.v * volume1_lo.v) / reduce.v + unpacked1.v * volume1_hi.v;
+    unpacked2.v = (unpacked2.v * volume2_lo.v) / reduce.v + unpacked2.v * volume2_hi.v;
+
+    for (u = 0; u < PA_INT32_VECTOR_SIZE; u++)
+        pa_log("unpacked1=%i\n", unpacked1.i[u]);
+    /* Pack both 32bit vectors back into one int16 vector with signed saturation */
+    output.m = _mm_packs_epi32(unpacked1.m, unpacked2.m);
+
+    for (u = 0; u < PA_INT16_VECTOR_SIZE; u++)
+        pa_log("output=%i\n", output.i[u]);
+
+#endif
+
+    return 0;
+}