diff options
Diffstat (limited to 'src/pulsecore')
-rw-r--r-- | src/pulsecore/cpu-x86.c | 5 | ||||
-rw-r--r-- | src/pulsecore/cpu-x86.h | 3 | ||||
-rw-r--r-- | src/pulsecore/macro.h | 6 | ||||
-rw-r--r-- | src/pulsecore/remap_mmx.c | 43 | ||||
-rw-r--r-- | src/pulsecore/remap_sse.c | 175 | ||||
-rw-r--r-- | src/pulsecore/sample-util.c | 4 | ||||
-rw-r--r-- | src/pulsecore/sconv.c | 4 | ||||
-rw-r--r-- | src/pulsecore/sconv_sse.c | 235 |
8 files changed, 448 insertions, 27 deletions
diff --git a/src/pulsecore/cpu-x86.c b/src/pulsecore/cpu-x86.c
index bc093ec0..1ba9f1a4 100644
--- a/src/pulsecore/cpu-x86.c
+++ b/src/pulsecore/cpu-x86.c
@@ -115,8 +115,11 @@ void pa_cpu_init_x86 (void) {
         pa_remap_func_init_mmx (flags);
     }
 
-    if (flags & PA_CPU_X86_SSE)
+    if (flags & PA_CPU_X86_SSE) {
         pa_volume_func_init_sse (flags);
+        pa_remap_func_init_sse (flags);
+        pa_convert_func_init_sse (flags);
+    }
 
 #endif /* defined (__i386__) || defined (__amd64__) */
 }
diff --git a/src/pulsecore/cpu-x86.h b/src/pulsecore/cpu-x86.h
index b11ef6ea..b40eb5ce 100644
--- a/src/pulsecore/cpu-x86.h
+++ b/src/pulsecore/cpu-x86.h
@@ -64,5 +64,8 @@ void pa_volume_func_init_mmx(pa_cpu_x86_flag_t flags);
 void pa_volume_func_init_sse(pa_cpu_x86_flag_t flags);
 
 void pa_remap_func_init_mmx(pa_cpu_x86_flag_t flags);
+void pa_remap_func_init_sse(pa_cpu_x86_flag_t flags);
+
+void pa_convert_func_init_sse (pa_cpu_x86_flag_t flags);
 
 #endif /* foocpux86hfoo */
diff --git a/src/pulsecore/macro.h b/src/pulsecore/macro.h
index 87684ad3..bffcc264 100644
--- a/src/pulsecore/macro.h
+++ b/src/pulsecore/macro.h
@@ -80,6 +80,12 @@ static inline size_t PA_PAGE_ALIGN(size_t l) {
 
 #define PA_ELEMENTSOF(x) (sizeof(x)/sizeof((x)[0]))
 
+#if defined(__GNUC__)
+    #define PA_DECLARE_ALIGNED(n,t,v)      t v __attribute__ ((aligned (n)))
+#else
+    #define PA_DECLARE_ALIGNED(n,t,v)      t v
+#endif
+
 /* The users of PA_MIN and PA_MAX, PA_CLAMP, PA_ROUND_UP should be
  * aware that these macros on non-GCC executed code with side effects
  * twice. It is thus considered misuse to use code with side effects
diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c
index 00252dac..b5fe82ee 100644
--- a/src/pulsecore/remap_mmx.c
+++ b/src/pulsecore/remap_mmx.c
@@ -51,7 +51,7 @@
                 " punpckl"#s" %%mm4, %%mm4     \n\t"    \
                 " punpckh"#s" %%mm5, %%mm5     \n\t"    \
                 " punpckl"#s" %%mm6, %%mm6     \n\t"    \
-                " punpckh"#s" %%mm7, %%mm7     \n\t"    \
+                " punpckh"#s" %%mm7, %%mm7     \n\t"
 
 #define STORE_SAMPLES                                   \
                 " movq %%mm0, (%0)             \n\t"    \
@@ -67,32 +67,31 @@
 
 #define HANDLE_SINGLE(s)                                \
                 " movd (%1), %%mm0             \n\t"    \
-                " movq %%mm0, %%mm1            \n\t"    \
                 " punpckl"#s" %%mm0, %%mm0     \n\t"    \
                 " movq %%mm0, (%0)             \n\t"    \
                 " add $4, %1                   \n\t"    \
                 " add $8, %0                   \n\t"
 
-#define MONO_TO_STEREO(s)                       \
-                " mov %3, %2                   \n\t"    \
-                " sar $3, %2                   \n\t"    \
-                " cmp $0, %2                   \n\t"    \
-                " je 2f                        \n\t"    \
-                "1:                            \n\t"    \
-                LOAD_SAMPLES                    \
-                UNPACK_SAMPLES(s)               \
-                STORE_SAMPLES                   \
-                " dec %2                       \n\t"    \
-                " jne 1b                       \n\t"    \
-                "2:                            \n\t"    \
-                " mov %3, %2                   \n\t"    \
-                " and $7, %2                   \n\t"    \
-                " je 4f                        \n\t"    \
-                "3:                            \n\t"    \
-                HANDLE_SINGLE(s)                \
-                " dec %2                       \n\t"    \
-                " jne 3b                       \n\t"    \
-                "4:                            \n\t"    \
+#define MONO_TO_STEREO(s)                               \
+                " mov %3, %2                   \n\t"    \
+                " sar $3, %2                   \n\t"    \
+                " cmp $0, %2                   \n\t"    \
+                " je 2f                        \n\t"    \
+                "1:                            \n\t"    \
+                LOAD_SAMPLES                            \
+                UNPACK_SAMPLES(s)                       \
+                STORE_SAMPLES                           \
+                " dec %2                       \n\t"    \
+                " jne 1b                       \n\t"    \
+                "2:                            \n\t"    \
+                " mov %3, %2                   \n\t"    \
+                " and $7, %2                   \n\t"    \
+                " je 4f                        \n\t"    \
+                "3:                            \n\t"    \
+                HANDLE_SINGLE(s)                        \
+                " dec %2                       \n\t"    \
+                " jne 3b                       \n\t"    \
+                "4:                            \n\t"    \
                 " emms                         \n\t"
 
 #if defined (__i386__) || defined (__amd64__)
diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c
new file mode 100644
index 00000000..97f2476e
--- /dev/null
+++ b/src/pulsecore/remap_sse.c
@@ -0,0 +1,175 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2004-2006 Lennart Poettering
+  Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk.com>
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include <pulse/sample.h>
+#include <pulsecore/log.h>
+#include <pulsecore/macro.h>
+
+#include "cpu-x86.h"
+#include "remap.h"
+
+/* Read 64 bytes of mono input from (%1) and keep a second copy of each
+ * register so that the low and the high half can be unpacked separately. */
+#define LOAD_SAMPLES                                    \
+                " movdqu (%1), %%xmm0          \n\t"    \
+                " movdqu 16(%1), %%xmm2        \n\t"    \
+                " movdqu 32(%1), %%xmm4        \n\t"    \
+                " movdqu 48(%1), %%xmm6        \n\t"    \
+                " movdqa %%xmm0, %%xmm1        \n\t"    \
+                " movdqa %%xmm2, %%xmm3        \n\t"    \
+                " movdqa %%xmm4, %%xmm5        \n\t"    \
+                " movdqa %%xmm6, %%xmm7        \n\t"
+
+/* Interleave every register with itself, which duplicates each sample;
+ * s is "dq" for 32 bit samples and "wd" for 16 bit samples. */
+#define UNPACK_SAMPLES(s)                               \
+                " punpckl"#s" %%xmm0, %%xmm0   \n\t"    \
+                " punpckh"#s" %%xmm1, %%xmm1   \n\t"    \
+                " punpckl"#s" %%xmm2, %%xmm2   \n\t"    \
+                " punpckh"#s" %%xmm3, %%xmm3   \n\t"    \
+                " punpckl"#s" %%xmm4, %%xmm4   \n\t"    \
+                " punpckh"#s" %%xmm5, %%xmm5   \n\t"    \
+                " punpckl"#s" %%xmm6, %%xmm6   \n\t"    \
+                " punpckh"#s" %%xmm7, %%xmm7   \n\t"
+
+/* Write the resulting 128 bytes to (%0) and advance both pointers. */
+#define STORE_SAMPLES                                   \
+                " movdqu %%xmm0, (%0)          \n\t"    \
+                " movdqu %%xmm1, 16(%0)        \n\t"    \
+                " movdqu %%xmm2, 32(%0)        \n\t"    \
+                " movdqu %%xmm3, 48(%0)        \n\t"    \
+                " movdqu %%xmm4, 64(%0)        \n\t"    \
+                " movdqu %%xmm5, 80(%0)        \n\t"    \
+                " movdqu %%xmm6, 96(%0)        \n\t"    \
+                " movdqu %%xmm7, 112(%0)       \n\t"    \
+                " add $64, %1                  \n\t"    \
+                " add $128, %0                 \n\t"
+
+/* Tail handling: duplicate exactly one 32 bit sample. */
+#define HANDLE_SINGLE_dq                                \
+                " movd (%1), %%xmm0            \n\t"    \
+                " punpckldq %%xmm0, %%xmm0     \n\t"    \
+                " movq %%xmm0, (%0)            \n\t"    \
+                " add $4, %1                   \n\t"    \
+                " add $8, %0                   \n\t"
+
+/* Tail handling: duplicate exactly one 16 bit sample. pinsrw only reads
+ * two bytes, so nothing behind the last input sample is touched. */
+#define HANDLE_SINGLE_wd                                \
+                " pinsrw $0, (%1), %%xmm0      \n\t"    \
+                " punpcklwd %%xmm0, %%xmm0     \n\t"    \
+                " movd %%xmm0, (%0)            \n\t"    \
+                " add $2, %1                   \n\t"    \
+                " add $4, %0                   \n\t"
+
+/* %0 = dst, %1 = src, %2 = scratch counter, %3 = number of samples.
+ * One pass of the main loop consumes 64 input bytes, i.e. (1 << shift)
+ * samples; the remaining (n & mask) samples are handled one by one. */
+#define MONO_TO_STEREO(s,shift,mask)                    \
+                " mov %3, %2                   \n\t"    \
+                " sar $"#shift", %2            \n\t"    \
+                " cmp $0, %2                   \n\t"    \
+                " je 2f                        \n\t"    \
+                "1:                            \n\t"    \
+                LOAD_SAMPLES                            \
+                UNPACK_SAMPLES(s)                       \
+                STORE_SAMPLES                           \
+                " dec %2                       \n\t"    \
+                " jne 1b                       \n\t"    \
+                "2:                            \n\t"    \
+                " mov %3, %2                   \n\t"    \
+                " and $"#mask", %2             \n\t"    \
+                " je 4f                        \n\t"    \
+                "3:                            \n\t"    \
+                HANDLE_SINGLE_##s                       \
+                " dec %2                       \n\t"    \
+                " jne 3b                       \n\t"    \
+                "4:                            \n\t"
+
+#if defined (__i386__) || defined (__amd64__)
+
+/* Duplicate n mono samples from src into n interleaved stereo frames at dst. */
+static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, unsigned n) {
+    pa_reg_x86 temp;
+
+    switch (*m->format) {
+        case PA_SAMPLE_FLOAT32NE:
+        {
+            __asm__ __volatile__ (
+                MONO_TO_STEREO(dq,4,15) /* do doubles to quads, 16 samples per pass */
+                : "+r" (dst), "+r" (src), "=&r" (temp)
+                : "r" ((pa_reg_x86)n)
+                : "cc", "memory"
+            );
+            break;
+        }
+        case PA_SAMPLE_S16NE:
+        {
+            __asm__ __volatile__ (
+                MONO_TO_STEREO(wd,5,31) /* do words to doubles, 32 samples per pass */
+                : "+r" (dst), "+r" (src), "=&r" (temp)
+                : "r" ((pa_reg_x86)n)
+                : "cc", "memory"
+            );
+            break;
+        }
+        default:
+            pa_assert_not_reached();
+    }
+}
+
+/* set the function that will execute the remapping based on the matrices */
+static void init_remap_sse (pa_remap_t *m) {
+    unsigned n_oc, n_ic;
+
+    n_oc = m->o_ss->channels;
+    n_ic = m->i_ss->channels;
+
+    /* find some common channel remappings, fall back to full matrix operation. */
+    if (n_ic == 1 && n_oc == 2 &&
+            m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) {
+        m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_sse;
+        pa_log_info("Using SSE mono to stereo remapping");
+    }
+}
+
+#endif /* defined (__i386__) || defined (__amd64__) */
+
+/* Register the SSE remapper initialiser when the CPU can execute it. */
+void pa_remap_func_init_sse (pa_cpu_x86_flag_t flags) {
+#if defined (__i386__) || defined (__amd64__)
+    /* The remapping code uses movdqu/movdqa/punpck* on XMM registers,
+     * which are SSE2 instructions, so plain SSE is not sufficient. */
+    if (!(flags & PA_CPU_X86_SSE2))
+        return;
+
+    pa_log_info("Initialising SSE optimized remappers.");
+
+    pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_sse);
+#endif /* defined (__i386__) || defined (__amd64__) */
+}
diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c
index 6e97e5a9..5fae1928 100644
--- a/src/pulsecore/sample-util.c
+++ b/src/pulsecore/sample-util.c
@@ -137,7 +137,7 @@ static void calc_linear_float_volume(float linear[], const pa_cvolume *volume) {
 
 static void calc_linear_integer_stream_volumes(pa_mix_info streams[], unsigned nstreams, const pa_cvolume *volume, const pa_sample_spec *spec) {
     unsigned k, channel;
-    float linear[PA_CHANNELS_MAX];
+    float linear[PA_CHANNELS_MAX + VOLUME_PADDING];
 
     pa_assert(streams);
     pa_assert(spec);
@@ -156,7 +156,7 @@ static void calc_linear_integer_stream_volumes(pa_mix_info streams[], unsigned n
 
 static void calc_linear_float_stream_volumes(pa_mix_info streams[], unsigned nstreams, const pa_cvolume *volume, const pa_sample_spec *spec) {
     unsigned k, channel;
-    float linear[PA_CHANNELS_MAX];
+    float linear[PA_CHANNELS_MAX + VOLUME_PADDING];
 
     pa_assert(streams);
     pa_assert(spec);
diff --git a/src/pulsecore/sconv.c b/src/pulsecore/sconv.c
index d06d6985..301f08b4 100644
--- a/src/pulsecore/sconv.c
+++ b/src/pulsecore/sconv.c
@@ -52,8 +52,8 @@ static void u8_from_float32ne(unsigned n, const float *a, uint8_t *b) {
     for (; n > 0; n--, a++, b++) {
         float v;
         v = (*a * 127.0) + 128.0;
-	v = PA_CLAMP_UNLIKELY (v, 0.0, 255.0);
-	*b = rint (v);
+        v = PA_CLAMP_UNLIKELY (v, 0.0, 255.0);
+        *b = rint (v);
     }
 }
 
diff --git a/src/pulsecore/sconv_sse.c b/src/pulsecore/sconv_sse.c
new file mode 100644
index 00000000..b213d991
---
/dev/null
+++ b/src/pulsecore/sconv_sse.c
@@ -0,0 +1,235 @@
+/***
+  This file is part of PulseAudio.
+
+  Copyright 2004-2006 Lennart Poettering
+  Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
+
+  PulseAudio is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published
+  by the Free Software Foundation; either version 2.1 of the License,
+  or (at your option) any later version.
+
+  PulseAudio is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with PulseAudio; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+  USA.
+***/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <pulsecore/g711.h>
+#include <pulsecore/macro.h>
+
+#include "endianmacros.h"
+
+#include "cpu-x86.h"
+#include "sconv.h"
+
+static pa_convert_func_t func;
+
+#if defined (__i386__) || defined (__amd64__)
+/* Clamp limits and scale factor; 16-byte aligned because they are loaded with movaps. */
+static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
+static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
+static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
+
+static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
+    pa_reg_x86 temp, i;
+    /* Clamp n floats of a to [-1.0, 1.0], scale by 0x7fff, store as s16 in b. %0 = sample index, %1 = counter/scratch. */
+    __asm__ __volatile__ (
+        " movaps %5, %%xmm5             \n\t" /* 1.0 */
+        " movaps %6, %%xmm6             \n\t" /* -1.0 */
+        " movaps %7, %%xmm7             \n\t" /* 0x7fff */
+        " xor %0, %0                    \n\t"
+
+        " mov %4, %1                    \n\t"
+        " sar $3, %1                    \n\t" /* 8 floats at a time */
+        " cmp $0, %1                    \n\t"
+        " je 2f                         \n\t"
+
+        "1:                             \n\t"
+        " movups (%2, %0, 4), %%xmm0    \n\t" /* read 8 floats */
+        " movups 16(%2, %0, 4), %%xmm2  \n\t"
+        " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
+        " minps  %%xmm5, %%xmm2         \n\t"
+        " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
+        " maxps  %%xmm6, %%xmm2         \n\t"
+        " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
+        " mulps  %%xmm7, %%xmm2         \n\t"
+
+        " cvtps2pi %%xmm0, %%mm0        \n\t" /* low part to int */
+        " cvtps2pi %%xmm2, %%mm2        \n\t"
+        " movhlps  %%xmm0, %%xmm0       \n\t" /* bring high part in position */
+        " movhlps  %%xmm2, %%xmm2       \n\t"
+        " cvtps2pi %%xmm0, %%mm1        \n\t" /* high part to int */
+        " cvtps2pi %%xmm2, %%mm3        \n\t"
+
+        " packssdw %%mm1, %%mm0         \n\t" /* pack parts */
+        " packssdw %%mm3, %%mm2         \n\t"
+        " movq     %%mm0, (%3, %0, 2)   \n\t"
+        " movq     %%mm2, 8(%3, %0, 2)  \n\t"
+
+        " add $8, %0                    \n\t" /* 8 samples done */
+        " dec %1                        \n\t"
+        " jne 1b                        \n\t"
+
+        "2:                             \n\t"
+        " mov %4, %1                    \n\t" /* prepare for leftovers */
+        " and $7, %1                    \n\t" /* n % 8, matching the 8-sample main loop */
+        " je 4f                         \n\t"
+
+        "3:                             \n\t"
+        " movss (%2, %0, 4), %%xmm0     \n\t"
+        " minss  %%xmm5, %%xmm0         \n\t"
+        " maxss  %%xmm6, %%xmm0         \n\t"
+        " mulss  %%xmm7, %%xmm0         \n\t"
+        " cvtss2si %%xmm0, %1           \n\t" /* scratch; the input operand %4 stays untouched */
+        " movw  %w1, (%3, %0, 2)        \n\t"
+        " inc %0                        \n\t"
+        " cmp %4, %0                    \n\t" /* stop once all n samples are written */
+        " jne 3b                        \n\t"
+
+        "4:                             \n\t"
+        " emms                          \n\t"
+
+        : "=&r" (i), "=&r" (temp)
+        : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
+        : "cc", "memory"
+    );
+}
+
+static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
+    pa_reg_x86 temp, i;
+    /* Same conversion as above using SSE2 only (no MMX, hence no emms). %0 = sample index, %1 = counter/scratch. */
+    __asm__ __volatile__ (
+        " movaps %5, %%xmm5             \n\t" /* 1.0 */
+        " movaps %6, %%xmm6             \n\t" /* -1.0 */
+        " movaps %7, %%xmm7             \n\t" /* 0x7fff */
+        " xor %0, %0                    \n\t"
+
+        " mov %4, %1                    \n\t"
+        " sar $3, %1                    \n\t" /* 8 floats at a time */
+        " cmp $0, %1                    \n\t"
+        " je 2f                         \n\t"
+
+        "1:                             \n\t"
+        " movups (%2, %0, 4), %%xmm0    \n\t" /* read 8 floats */
+        " movups 16(%2, %0, 4), %%xmm2  \n\t"
+        " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
+        " minps  %%xmm5, %%xmm2         \n\t"
+        " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
+        " maxps  %%xmm6, %%xmm2         \n\t"
+        " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
+        " mulps  %%xmm7, %%xmm2         \n\t"
+
+        " cvtps2dq %%xmm0, %%xmm0       \n\t"
+        " cvtps2dq %%xmm2, %%xmm2       \n\t"
+
+        " packssdw %%xmm2, %%xmm0       \n\t"
+        " movdqu   %%xmm0, (%3, %0, 2)  \n\t"
+
+        " add $8, %0                    \n\t" /* 8 samples done */
+        " dec %1                        \n\t"
+        " jne 1b                        \n\t"
+
+        "2:                             \n\t"
+        " mov %4, %1                    \n\t" /* prepare for leftovers */
+        " and $7, %1                    \n\t" /* n % 8, matching the 8-sample main loop */
+        " je 4f                         \n\t"
+
+        "3:                             \n\t"
+        " movss (%2, %0, 4), %%xmm0     \n\t"
+        " minss  %%xmm5, %%xmm0         \n\t"
+        " maxss  %%xmm6, %%xmm0         \n\t"
+        " mulss  %%xmm7, %%xmm0         \n\t"
+        " cvtss2si %%xmm0, %1           \n\t" /* scratch; the input operand %4 stays untouched */
+        " movw  %w1, (%3, %0, 2)        \n\t"
+        " inc %0                        \n\t"
+        " cmp %4, %0                    \n\t" /* stop once all n samples are written */
+        " jne 3b                        \n\t"
+
+        "4:                             \n\t"
+
+        : "=&r" (i), "=&r" (temp)
+        : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
+        : "cc", "memory"
+    );
+}
+
+#undef RUN_TEST
+
+#ifdef RUN_TEST
+#define SAMPLES 1019
+#define TIMES 1000
+
+static void run_test (void) {
+    int16_t samples[SAMPLES];
+    int16_t samples_ref[SAMPLES];
+    float floats[SAMPLES];
+    int i;
+    pa_usec_t start, stop;
+
+    printf ("checking SSE %zd\n", sizeof (samples));
+
+    memset (samples_ref, 0, sizeof (samples_ref));
+    memset (samples, 0, sizeof (samples));
+
+    for (i = 0; i < SAMPLES; i++) {
+        floats[i] = (rand()/(RAND_MAX+2.2)) - 1.1;
+    }
+
+    func = pa_get_convert_from_float32ne_function (PA_SAMPLE_S16LE);
+    func (SAMPLES, floats, samples_ref);
+    pa_sconv_s16le_from_f32ne_sse2 (SAMPLES, floats, samples);
+
+    for (i = 0; i < SAMPLES; i++) {
+        if (samples[i] != samples_ref[i]) {
+            printf ("%d: %04x != %04x (%f)\n", i, samples[i], samples_ref[i],
+                      floats[i]);
+        }
+    }
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        pa_sconv_s16le_from_f32ne_sse2 (SAMPLES, floats, samples);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
+
+    start = pa_rtclock_now();
+    for (i = 0; i < TIMES; i++) {
+        func (SAMPLES, floats, samples_ref);
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+}
+#endif
+#endif /* defined (__i386__) || defined (__amd64__) */
+
+/* Install the SSE2 or, failing that, the SSE/MMX float32ne to s16le converter. */
+void pa_convert_func_init_sse (pa_cpu_x86_flag_t flags) {
+#if defined (__i386__) || defined (__amd64__)
+    pa_log_info("Initialising SSE optimized conversions.");
+
+#ifdef RUN_TEST
+    run_test ();
+#endif
+
+    if (flags & PA_CPU_X86_SSE2)
+        pa_set_convert_from_float32ne_function (PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
+    else
+        pa_set_convert_from_float32ne_function (PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
+
+#endif /* defined (__i386__) || defined (__amd64__) */
+}
+